diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7649a5e1241acc8adf4cdf15f39b504b0787a4f7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h @@ -0,0 +1,218 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { + +struct IValue; +using Stack = std::vector; + +class OperatorHandle; +class KernelFunction; + +// This kernel implements the behavior of falling through to the next available +// registered dispatch key. The implementation of this function is FAST; it is +// no overhead to fallthrough to the next key. See cpp file for some more +// implementation notes; notably, this does NOT actually go through the +// boxing/unboxing codepath. +TORCH_API void fallthrough_kernel( + OperatorKernel* /*unused*/, + const OperatorHandle& /*unused*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); + +// Note [Ambiguity in AutogradOther kernel] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This error-reporting kernel is registered to the AutogradOther entry in the +// dispatch table when there is both a CompositeImplicitAutograd kernel and a +// backend kernel for ANY backend that maps to AutogradOther. To see why +// this is necessary in the AutogradOther case, it's helpful to first see +// why everything works out fine for a backend that has a reserved Autograd +// entry (see rule 2.2 in [Note] DispatchTable computation): +// +// CPU AutogradCPU +// reg? registers with... +// ------------------------------------------------- +// y Autograd registration takes precedence +// over CompositeImplicitAutograd. +// This is good, because the CPU specific backend +// implementation is more specialized and typically better; +// if we used the composite, we would bypass it. +// (NB: the Autograd key is guaranteed to exist because +// the autograd codegen requires it!) +// +// n CompositeImplicitAutograd takes precedence. +// This is also good, because the Autograd +// registration (if it exists) would try to redispatch +// to the (non-existent) CPU implementation; by +// using the composite, we ensure the operator +// actually works. +// +// As you can see, when we have a specific Autograd key (AutogradCPU), we can +// decide whether or not to use the CompositeImplicitAutograd kernel or the +// Autograd kernel based on whether or not the backend kernel exists. +// +// However, for AutogradOther (which is the catchall autograd kernel for +// everything that doesn't have a specific Autograd key), we can't do this +// trick because there isn't any unique backend to peek at to disambiguate; +// if there are some backends that have implementations they prefer Autograd, +// but unimplemented backends would prefer CompositeImplicitAutograd. Rather +// than arbitrarily pick one or the other, we just register a kernel that raises +// an error and let the user decide how to proceed. +TORCH_API void ambiguous_autogradother_kernel( + OperatorKernel* /*unused*/, + const OperatorHandle& /*op*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); + +// Note [named_not_supported_kernel] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// This kernel implements reporting an error message saying that named tensor is +// not supported. This kernel doesn't rely on the Stack, and so it is special +// cased in the dispatcher to be triggered before we attempt boxing (so we can +// give a good error message in cases when boxing is not supported). When +// boxing is universally supported this can be removed. +[[noreturn]] TORCH_API void named_not_supported_kernel( + OperatorKernel* /*unused*/, + const OperatorHandle& /*op*/, + DispatchKeySet /*unused*/, + Stack* /*unused*/); + +/** + * BoxedKernel is similar to a std::function storing a boxed kernel. + */ +class TORCH_API BoxedKernel final { + public: + // This is how boxed kernels are actually stored + // + // Note [Plumbing Keys Through The Dispatcher] + // Benchmarks have shown that it is expensive for the dispatcher to read from + // thread-local storage (TLS) upon every dispatch call into order to compute + // which kernel to dispatch to. + // + // To mitigate this, we've updated the calling convention inside the + // dispatcher to expect every kernel that it stores to have a first argument + // of type DispatchKeySet. + // + // What are the invariants of the DispatchKeySet when it gets passed to a + // kernel? + // - All keys to the left of the current dispatch key have been masked out. + // (e.g. a Tracing kernel that takes in the DispatchKeySet will expect the + // highest bit to be DispatchKey::Tracer) + // - All other keys that dispatcher normally would have computed through TLS + + // global state + op arguments + // are still in the set. + // + // Kernels can then opt into using this keyset to save the dispatcher from + // doing repeated work during redispatches: recalculating the highest-priority + // dispatch key, which involves reading from TLS. Instead, the kernels that + // opt in will calculate an updated DispatchKeySet directly from the old one, + // and pass the updated set directly into the dispatcher upon redispatching. + // + // This is an opt-in mechanism: Kernels can automatically opt in by setting + // the first argument in their signature to be of type DispatchKeySet. See the + // kernels in VariableTypeEverything.cpp and TraceTypeEverything.cpp for + // examples. + // + // The mechanism for optionally passing that DispatchKeySet into the kernel + // lives in make_boxed_from_unboxed_functor.h. See Note [Plumbing Keys Through + // The Dispatcher 2] for details. + using InternalBoxedKernelFunction = + void(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*); + // This is the public API for how boxed kernels are defined + using BoxedKernelFunction = void(const OperatorHandle&, Stack*); + using BoxedKernelFunction_withDispatchKeys = + void(const OperatorHandle&, DispatchKeySet, Stack*); + + BoxedKernel(); + + // Fast path for dispatch to allow not touching the boxed kernel in + // the common case where unboxed is available. + bool isValid() const; + bool isFallthrough() const; + + /** + * Call the function with boxed arguments. + */ + void callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const; + + /** + * Create a KernelFunction from a boxed function. + * + * Example: + * + * > void boxed_func(OperatorKernel*, Stack* stack) {...} + * > BoxedFunction func = BoxedKernel::makeFromFunction<&boxed_func>(); + */ + template + static BoxedKernel makeFromFunction(); + + /** + * TODO: This will only be useful if we write a backend fallback that plumbs + * dispatch keys (currently there are none) See Note [Plumbing Keys Through + * The Dispatcher] for details. + */ + template + static BoxedKernel makeFromFunction(); + + /** + * Create a KernelFunction from a boxed functor. + * + * Example: + * + * > class MyFunctor final : public c10::OperatorKernel { + * > public: + * > void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...} + * > }; + * > BoxedKernel func = + * BoxedKernel::makeFromFunctor(std::make_unique()); + */ + template + static BoxedKernel makeFromFunctor( + std::unique_ptr kernelFunctor); + + static BoxedKernel makeFallthrough(); + static BoxedKernel makeAmbiguousAutogradOther(); + static BoxedKernel makeNamedNotSupported(); + + private: + friend class KernelFunction; + + template + static void make_boxed_function( + OperatorKernel* /*unused*/, + const OperatorHandle& opHandle, + DispatchKeySet /*unused*/, + Stack* stack); + + template + static void make_boxed_function( + OperatorKernel* /*unused*/, + const OperatorHandle& opHandle, + DispatchKeySet /*ks*/, + Stack* stack); + + explicit BoxedKernel( + std::unique_ptr functor, + InternalBoxedKernelFunction* boxed_kernel_func); + + OperatorKernel* getFunctor() const; + InternalBoxedKernelFunction* getFnPtr() const; + + c10::intrusive_ptr functor_; + InternalBoxedKernelFunction* boxed_kernel_func_; +}; + +} // namespace c10 + +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..69c8b2cf65d6f0256193ee3899708ad18c7d6768 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace c10 { + +inline BoxedKernel::BoxedKernel() : boxed_kernel_func_(nullptr) {} + +inline BoxedKernel::BoxedKernel( + std::unique_ptr functor, + InternalBoxedKernelFunction* boxed_kernel_func) + : functor_(std::move(functor)), boxed_kernel_func_(boxed_kernel_func) {} + +template +inline void BoxedKernel::make_boxed_function( + OperatorKernel* /*unused*/, + const OperatorHandle& opHandle, + DispatchKeySet /*unused*/, + Stack* stack) { + // Note that we're dropping the DispatchKeySet argument. + // See Note [Plumbing Keys Through The Dispatcher 2] for details. + func(opHandle, stack); +} + +template +inline void BoxedKernel::make_boxed_function( + OperatorKernel* /*unused*/, + const OperatorHandle& opHandle, + DispatchKeySet ks, + Stack* stack) { + // See Note [Plumbing Keys Through The Dispatcher 2] for details. + func(opHandle, ks, stack); +} + +inline bool BoxedKernel::isValid() const { + return boxed_kernel_func_ != nullptr; +} + +inline bool BoxedKernel::isFallthrough() const { + return boxed_kernel_func_ == &fallthrough_kernel; +} + +inline void BoxedKernel::callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + boxed_kernel_func_ != nullptr, + "Tried to call BoxedKernel::callBoxed() on an uninitialized BoxedKernel."); + (*boxed_kernel_func_)(functor_.get(), opHandle, dispatchKeySet, stack); +} + +template +inline BoxedKernel BoxedKernel::makeFromFunction() { + return BoxedKernel( + nullptr, // no functor_ object + &make_boxed_function); +} + +template +inline BoxedKernel BoxedKernel::makeFromFunction() { + return BoxedKernel( + nullptr, // no functor_ object + &make_boxed_function); +} + +inline BoxedKernel BoxedKernel::makeFallthrough() { + return BoxedKernel( + nullptr, // no functor_ object + &fallthrough_kernel); +} + +inline BoxedKernel BoxedKernel::makeAmbiguousAutogradOther() { + return BoxedKernel( + nullptr, // no functor_ object + &ambiguous_autogradother_kernel); +} + +inline BoxedKernel BoxedKernel::makeNamedNotSupported() { + return BoxedKernel( + nullptr, // no functor_ object + &named_not_supported_kernel); +} + +template +inline BoxedKernel BoxedKernel::makeFromFunctor( + std::unique_ptr kernelFunctor) { + static_assert( + std::is_base_of_v, + "Tried to call BoxedKernel::makeFromFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + return BoxedKernel( + std::move(kernelFunctor), + [](OperatorKernel* kernel, + const OperatorHandle& op, + DispatchKeySet ks, + Stack* stack) { + (*static_cast(kernel))(op, ks, stack); + }); +} + +inline OperatorKernel* BoxedKernel::getFunctor() const { + return functor_.get(); +} +inline BoxedKernel::InternalBoxedKernelFunction* BoxedKernel::getFnPtr() const { + return boxed_kernel_func_; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h new file mode 100644 index 0000000000000000000000000000000000000000..fa53454d22edd1caa9d146b6dd3a5647a0b7dfee --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h @@ -0,0 +1,346 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +using Stack = torch::jit::Stack; // TODO Instead of this, move torch::jit::Stack + // to the c10 namespace. + +class OperatorHandle; +struct OperatorKernel; +class KernelFunction; + +class KernelToken; +class SafeKernelFunction; + +template +using has_symint = std::disjunction< + std::is_same, + std::is_same, + std::is_same, + std::is_same, T>>; + +template +struct remove_symint { + using type = T; +}; + +template <> +struct remove_symint { + using type = int64_t; +}; + +template <> +struct remove_symint { + using type = OptionalIntArrayRef; +}; + +template <> +struct remove_symint { + using type = c10::IntArrayRef; +}; + +template <> +struct remove_symint> { + using type = std::optional; +}; + +template +struct maybe_keep_symint final {}; + +template +struct maybe_keep_symint { + using type = T; +}; + +template +struct maybe_keep_symint { + using type = typename remove_symint::type; +}; + +template +using fn_has_symint = typename guts::typelist::true_for_any_type< + has_symint, + typename guts::infer_function_traits::type::parameter_types>; + +template +struct fn_remove_symint; + +template +struct fn_remove_symint { + using type = Ret(typename remove_symint::type...); +}; + +/** + * KernelFunction is similar to std::function but stores a kernel function. + * You can create a KernelFunction from a boxed or unboxed + * function/functor/lambda and call it in a boxed or unboxed way. If the way it + * was created doesn't match the way it was called, it will do boxing or + * unboxing as necessary. + */ +class TORCH_API KernelFunction final { + public: + using InternalBoxedKernelFunction = BoxedKernel::InternalBoxedKernelFunction; + using BoxedKernelFunction = BoxedKernel::BoxedKernelFunction; + using BoxedKernelFunction_withDispatchKeys = + BoxedKernel::BoxedKernelFunction_withDispatchKeys; + + KernelFunction(); + ~KernelFunction(); + + KernelFunction(const KernelFunction& other); + KernelFunction& operator=(const KernelFunction& other); + + KernelFunction(KernelFunction&&) noexcept = default; + + // Fast path for dispatch to allow not touching the boxed kernel in + // the common case where unboxed is available. + bool isValidUnboxed() const; + bool isValidSymUnboxed() const; + bool isValid() const; + bool isFallthrough() const; + + /** + * Call the function in a boxed way. + * If the kernel function was created with an unboxed function, + * this will call an unboxing wrapper which then calls into that + * unboxed function. + * + * Example: + * + * > void boxed_func(OperatorKernel*, Stack* stack) {...} + * > KernelFunction func = KernelFunction::makeFromBoxedFunction(&boxed_func); + * > Tensor result = func.callBoxed(stack); + * + * Or, with an unboxed implementation: + * + * > KernelFunction func = KernelFunction::makeFromUnboxedLambda( + * > [] (Tensor a, bool b) -> Tensor {...}); + * > Tensor result = func.callBoxed(stack); + */ + void callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const; + + /** + * Call the function in an unboxed way. + * If the kernel function was created with a boxed function, + * this will box all inputs and then call into that boxed function. + * + * Note that this doesn't work for all types yet. + * + * Example: + * + * > KernelFunction func = KernelFunction::makeFromUnboxedLambda( + * > [] (Tensor a, bool b) -> Tensor {...}); + * > Tensor result = func.call(tensor1, true); + * + * Or, with a boxed implementation: + * + * > void boxed_func(OperatorKernel*, Stack* stack) {...} + * > KernelFunction func = KernelFunction::makeFromBoxedFunction(&boxed_func); + * > Tensor result = func.call(tensor1, true); + */ + template + Return call( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Args... args) const; + + /** + * Create a KernelFunction from a BoxedKernel. + */ + static KernelFunction makeFromBoxedKernel(BoxedKernel boxed_fn); + + /** + * Create a KernelFunction from a boxed function. + * + * Example: + * + * > void boxed_func(OperatorKernel*, Stack* stack) {...} + * > KernelFunction func = + * KernelFunction::makeFromBoxedFunction<&boxed_func>(); + */ + template + static KernelFunction makeFromBoxedFunction(); + + /** + * TODO: This will only be useful if we write a backend fallback that plumbs + * dispatch keys (currently there are none) See Note [Plumbing Keys Through + * The Dispatcher] for details. + */ + template + static KernelFunction makeFromBoxedFunction(); + + /** + * Create a KernelFunction from an unboxed functor. + * + * Example: + * + * > class MyFunctor final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > KernelFunction func = + * KernelFunction::makeFromUnboxedFunctor(std::make_unique()); + */ + template + static KernelFunction makeFromUnboxedFunctor( + std::unique_ptr kernelFunctor); + + /** + * Create a KernelFunction from a boxed functor. + * + * Example: + * + * > class MyFunctor final : public c10::OperatorKernel { + * > public: + * > void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...} + * > }; + * > KernelFunction func = + * KernelFunction::makeFromBoxedFunctor(std::make_unique()); + */ + template + static KernelFunction makeFromBoxedFunctor( + std::unique_ptr kernelFunctor); + + /** + * Create a KernelFunction from an unboxed function. + * This is usually better than KernelFunction::makeFromUnboxedRuntimeFunction + * because knowing the function pointer as a template argument (i.e. at + * compile time) allows the compiler to inline the function into its + * unboxing wrapper and yields better performance when calling the function. + * + * Example: + * + * > Tensor unboxed_func(Tensor a, Tensor b) {...} + * > KernelFunction func = + * KernelFunction::makeFromUnboxedFunction(); + */ + template + static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/); + + /** + * Create a KernelFunction from an unboxed function. + * KernelFunction::makeFromUnboxedFunction is usually a better choice than + * this if you know the function pointer at compile time, see doc comment + * there for an explanation. + * + * Example: + * + * > Tensor unboxed_func(Tensor a, Tensor b) {...} + * > KernelFunction func = + * KernelFunction::makeFromUnboxedRuntimeFunction(&unboxed_func); + */ + template + static KernelFunction makeFromUnboxedRuntimeFunction(FuncType* func); + + static KernelFunction makeFallthrough(); + static KernelFunction makeAmbiguousAutogradOther(); + static KernelFunction makeNamedNotSupported(); + + /** + * Create a KernelFunction from an unboxed lambda. + * + * Example: + * + * > KernelFunction func = KernelFunction::makeFromUnboxedLambda( + * > [] (Tensor a, bool b) -> Tensor {...}); + */ + template + static std::enable_if_t< + guts::is_stateless_lambda>::value, + KernelFunction> + makeFromUnboxedLambda(Lambda&& lambda); + template + static std::enable_if_t< + !guts::is_stateless_lambda>::value, + KernelFunction> + makeFromUnboxedLambda(Lambda&& lambda); + + std::string dumpState() const; + // For testing internal invariants only + bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const; + + // Register a token to be invalidated when this KernelFunction is destroyed + void registerToken(std::weak_ptr token) const; + + private: + explicit KernelFunction( + std::unique_ptr functor, + InternalBoxedKernelFunction* boxed_kernel_func, + void* unboxed_kernel_func, + void* sym_unboxed_kernel_func); + explicit KernelFunction( + BoxedKernel boxed_fn, + void* unboxed_kernel_func, + void* sym_unboxed_kernel_func); + + BoxedKernel boxed_kernel_func_; + void* unboxed_kernel_func_; + void* sym_unboxed_kernel_func_; + // List of tokens that need to be invalidated when this KernelFunction is + // destroyed (lazy allocation to save memory when empty) + mutable std::unique_ptr>> tokens_; +}; + +// Token held by SafeKernelFunction that gets invalidated when KernelFunction is +// destroyed +class KernelToken { + public: + bool isValid() const; + void invalidate(); + + private: + std::atomic invalid_{false}; +}; + +class SafeKernelFunction { + public: + SafeKernelFunction( + const KernelFunction* kernel, + std::string debug, + std::shared_ptr opHandle); + + // Safe callBoxed - checks token validity first + void callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const; + + // Get debug information + const std::string& debug() const { + return debug_; + } + + // Get the OpHandle that lives on this SafeKernelFunction + const OperatorHandle& opHandle() const { + return *opHandle_; + } + + private: + KernelFunction kernel_; + std::shared_ptr token_; + std::string debug_; + std::shared_ptr opHandle_; +}; + +} // namespace c10 + +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..1d190e1809da3abeeff6b5ded93cf1694fef94f6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h @@ -0,0 +1,395 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include + +#include +#include + +namespace c10 { + +namespace detail { +template +std::enable_if_t< + !std::is_array_v && !std::is_array_v && + std::is_base_of_v, + std::unique_ptr> +make_unique_base(Args&&... args) { + return std::make_unique(std::forward(args)...); +} +} // namespace detail + +inline KernelFunction::KernelFunction() + : unboxed_kernel_func_(nullptr), sym_unboxed_kernel_func_(nullptr) {} + +inline KernelFunction::~KernelFunction() { + if (tokens_) { + for (auto& weak_token : *tokens_) { + if (auto token = weak_token.lock()) { + token->invalidate(); + } + } + } +} + +inline KernelFunction::KernelFunction(const KernelFunction& other) + : boxed_kernel_func_(other.boxed_kernel_func_), + unboxed_kernel_func_(other.unboxed_kernel_func_), + sym_unboxed_kernel_func_(other.sym_unboxed_kernel_func_) { + // tokens_ is intentionally not copied as we only care about invalidating + // tokens if the original KernelFunction is destroyed +} + +inline KernelFunction& KernelFunction::operator=(const KernelFunction& other) { + if (this != &other) { + boxed_kernel_func_ = other.boxed_kernel_func_; + unboxed_kernel_func_ = other.unboxed_kernel_func_; + sym_unboxed_kernel_func_ = other.sym_unboxed_kernel_func_; + + // tokens_ is intentionally not copied as we only care about invalidating + // tokens if the original KernelFunction is destroyed + } + return *this; +} + +inline KernelFunction::KernelFunction( + std::unique_ptr functor, + InternalBoxedKernelFunction* boxed_kernel_func, + void* unboxed_kernel_func, + void* sym_unboxed_kernel_func = nullptr) + : boxed_kernel_func_(std::move(functor), boxed_kernel_func), + unboxed_kernel_func_(unboxed_kernel_func), + sym_unboxed_kernel_func_(sym_unboxed_kernel_func) {} + +inline KernelFunction::KernelFunction( + BoxedKernel boxed_fn, + void* unboxed_kernel_func, + void* sym_unboxed_kernel_func = nullptr) + : boxed_kernel_func_(std::move(boxed_fn)), + unboxed_kernel_func_(unboxed_kernel_func), + sym_unboxed_kernel_func_(sym_unboxed_kernel_func) {} + +inline bool KernelFunction::isValidUnboxed() const { + return unboxed_kernel_func_ != nullptr; +} + +inline bool KernelFunction::isValidSymUnboxed() const { + return sym_unboxed_kernel_func_ != nullptr; +} + +inline bool KernelFunction::isValid() const { + return boxed_kernel_func_.isValid(); +} + +inline bool KernelFunction::isFallthrough() const { + return boxed_kernel_func_.isFallthrough(); +} + +inline void KernelFunction::callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const { + boxed_kernel_func_.callBoxed(opHandle, dispatchKeySet, stack); +} + +template +inline Return callUnboxedKernelFunction( + void* unboxed_kernel_func, + OperatorKernel* functor, + DispatchKeySet dispatchKeySet, + Args&&... args) { + using ActualSignature = Return(OperatorKernel*, DispatchKeySet, Args...); + ActualSignature* func = + reinterpret_cast(unboxed_kernel_func); + return (*func)(functor, dispatchKeySet, std::forward(args)...); +} + +// This template requires you to explicitly specify the argument you want to +// forward; it doesn't work if you try to deduce it +// NB: keep this in sync with cloneWithRealTypes in function_schema.cpp + +template +inline typename remove_symint::type unpackSymInt(T x) { + return x; +} + +template <> +inline remove_symint::type unpackSymInt(c10::SymInt x) { + return x.guard_int(__FILE__, __LINE__); +} + +template <> +inline remove_symint::type unpackSymInt( + c10::SymIntArrayRef x) { + return C10_AS_INTARRAYREF_SLOW(x); +} + +template <> +inline remove_symint>::type unpackSymInt( + std::optional x) { + return x.has_value() ? std::make_optional(x->guard_int(__FILE__, __LINE__)) + : std::nullopt; +} + +template <> +inline remove_symint::type unpackSymInt( + at::OptionalSymIntArrayRef x) { + return x.has_value() ? std::make_optional(C10_AS_INTARRAYREF_SLOW(*x)) + : std::nullopt; +} + +template +C10_ALWAYS_INLINE Return KernelFunction::call( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Args... args) const { + // note: Args above is intentionally not Args&&. We don't want perfect + // forwarding, which would require Args to be deduced, but instead we + // want callers to explicitly specify the Args. + + if constexpr (std::disjunction_v...>) { + if (sym_unboxed_kernel_func_ != nullptr) { + auto* functor = boxed_kernel_func_.getFunctor(); + return callUnboxedKernelFunction( + sym_unboxed_kernel_func_, + functor, + dispatchKeySet, + std::forward(args)...); + } + + if (unboxed_kernel_func_ != nullptr) { + auto* functor = boxed_kernel_func_.getFunctor(); + return callUnboxedKernelFunction< + Return, + typename remove_symint::type...>( + unboxed_kernel_func_, + functor, + dispatchKeySet, + unpackSymInt(args)...); + } + } else { + if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) { + auto* functor = boxed_kernel_func_.getFunctor(); + return callUnboxedKernelFunction( + unboxed_kernel_func_, + functor, + dispatchKeySet, + std::forward(args)...); + } + } + + return impl::BoxedKernelWrapper::call( + boxed_kernel_func_, + opHandle, + dispatchKeySet, + std::forward(args)...); +} + +inline void KernelFunction::registerToken( + std::weak_ptr token) const { + if (!tokens_) { + tokens_ = std::make_unique>>(); + } + tokens_->push_back(std::move(token)); +} + +inline KernelFunction KernelFunction::makeFromBoxedKernel( + BoxedKernel boxed_fn) { + return KernelFunction( + std::move(boxed_fn), nullptr); // no unboxed function pointer +} + +template +inline KernelFunction KernelFunction::makeFromBoxedFunction() { + return KernelFunction::makeFromBoxedKernel( + BoxedKernel::makeFromFunction()); +} + +template +inline KernelFunction KernelFunction::makeFromBoxedFunction() { + return KernelFunction::makeFromBoxedKernel( + BoxedKernel::makeFromFunction()); +} + +inline KernelFunction KernelFunction::makeFallthrough() { + return KernelFunction::makeFromBoxedKernel(BoxedKernel::makeFallthrough()); +} + +inline KernelFunction KernelFunction::makeAmbiguousAutogradOther() { + return KernelFunction::makeFromBoxedKernel( + BoxedKernel::makeAmbiguousAutogradOther()); +} + +inline KernelFunction KernelFunction::makeNamedNotSupported() { + return KernelFunction::makeFromBoxedKernel( + BoxedKernel::makeNamedNotSupported()); +} + +template +inline KernelFunction KernelFunction::makeFromUnboxedFunctor( + std::unique_ptr kernelFunctor) { +#ifndef NDEBUG + // This assertion is costly for build time so it's debug-gated. + static_assert( + guts::is_functor::value, + "Tried to call KernelFunction::makeFromUnboxedFunctor but the argument is not a functor."); +#endif + static_assert( + std::is_base_of_v, + "Tried to call KernelFunction::makeFromUnboxedFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + + auto* unboxed_fn = &impl::wrap_kernel_functor_unboxed::call; + void* void_unboxed_fn = reinterpret_cast(unboxed_fn); + bool is_symint = fn_has_symint::value; + return KernelFunction( + std::move(kernelFunctor), + &impl::make_boxed_from_unboxed_functor:: + call, + is_symint ? nullptr : void_unboxed_fn, + is_symint ? void_unboxed_fn : nullptr); +} + +template +inline KernelFunction KernelFunction::makeFromBoxedFunctor( + std::unique_ptr kernelFunctor) { + return KernelFunction::makeFromBoxedKernel( + BoxedKernel::makeFromFunctor(std::move(kernelFunctor))); +} + +template +inline KernelFunction KernelFunction::makeFromUnboxedFunction( + FuncPtr func_ptr) { + static_assert( + is_compile_time_function_pointer::value, + "Tried to call KernelFunction::makeFromUnboxedFunction with an invalid parameter. It must be a function pointer created with TORCH_FN."); + static_assert( + !std::is_same_v, + "Tried to call KernelFunction::makeFromUnboxedFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); +#if defined(__GNUC__) && defined(__SANITIZE_ADDRESS__) && !defined(__CUDACC__) + TORCH_INTERNAL_ASSERT( + FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); +#else + static_assert( + FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); +#endif + +#if !defined(C10_MOBILE) + (void)func_ptr; // Suppress unused variable warning + return makeFromUnboxedFunctor< + AllowLegacyTypes, + typename impl::WrapFunctionIntoFunctor::type>( + detail::make_unique_base< + OperatorKernel, + typename impl::WrapFunctionIntoFunctor::type>()); +#else + // On mobile, we rather want to optimize for binary size than for performance, + // so let's not inline the kernel into the wrapper but use + // makeFromUnboxedRuntimeFunction instead. + return makeFromUnboxedRuntimeFunction(func_ptr.func_ptr()); +#endif +} + +template +inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction( + FuncType* func) { + static_assert( + guts::is_function_type::value, + "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); + static_assert( + !std::is_same_v, + "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); + TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr"); + + return makeFromUnboxedFunctor< + AllowLegacyTypes, + impl::WrapFunctionIntoRuntimeFunctor>>( + detail::make_unique_base< + OperatorKernel, + impl::WrapFunctionIntoRuntimeFunctor>>(func)); +} + +template +inline std::enable_if_t< + guts::is_stateless_lambda>::value, + KernelFunction> +KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) { + static_assert( + guts::is_functor>::value, + "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type."); + +#if !defined(C10_MOBILE) + return makeFromUnboxedFunctor< + AllowLegacyTypes, + impl::WrapFunctionIntoRuntimeFunctor>>( + detail::make_unique_base< + OperatorKernel, + impl::WrapFunctionIntoRuntimeFunctor>>( + std::forward(lambda))); +#else + // On mobile, we rather want to optimize for binary size than for performance, + // so let's not inline the kernel into the wrapper but use + // makeFromUnboxedRuntimeFunction instead. + using FuncType = + typename guts::infer_function_traits_t>::func_type; + return makeFromUnboxedRuntimeFunction(lambda); +#endif +} + +template +inline std::enable_if_t< + !guts::is_stateless_lambda>::value, + KernelFunction> +KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) { + static_assert( + guts::is_functor>::value, + "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type."); + + return makeFromUnboxedFunctor< + AllowLegacyTypes, + impl::WrapFunctionIntoRuntimeFunctor>>( + detail::make_unique_base< + OperatorKernel, + impl::WrapFunctionIntoRuntimeFunctor>>( + std::forward(lambda))); +} + +inline bool KernelToken::isValid() const { + return !invalid_.load(std::memory_order_acquire); +} + +inline void KernelToken::invalidate() { + invalid_.store(true, std::memory_order_release); +} + +inline SafeKernelFunction::SafeKernelFunction( + const KernelFunction* kernel, + std::string debug, + std::shared_ptr opHandle) + : kernel_(kernel ? *kernel : KernelFunction()), + token_(std::make_shared()), + debug_(std::move(debug)), + opHandle_(std::move(opHandle)) { + // Register the token with the original kernel so it gets invalidated when the + // kernel is destroyed + if (kernel) { + kernel->registerToken(token_); + } +} + +inline void SafeKernelFunction::callBoxed( + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Stack* stack) const { + TORCH_CHECK( + token_ && token_->isValid(), + "SafeKernelFunction has been invalidated ", + debug_); + kernel_.callBoxed(opHandle, dispatchKeySet, stack); +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5bf328983091cf4e02f66e60462c5b9ffb082462 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace c10 { + +/** + * Inherit from OperatorKernel to implement a c10 kernel. + * + * Example: + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * + * The kernel class is allowed to have members but these are equivalent + * to global variables. The kernel implementation is responsible for + * preventing race conditions on them. + * + * See below for how to register this kernel with PyTorch. + */ +struct TORCH_API OperatorKernel : public c10::intrusive_ptr_target { + ~OperatorKernel() override = default; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h new file mode 100644 index 0000000000000000000000000000000000000000..aa1e5eb02d879ff1ca90a0261369e9b3e3ead4b2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10::impl { +namespace detail { +template +class WrapFunctionIntoFunctor_ {}; +template +class WrapFunctionIntoFunctor_< + FuncPtr, + ReturnType, + guts::typelist::typelist> + final : public c10::OperatorKernel { + public: + C10_ALWAYS_INLINE decltype(auto) operator()(Parameters... args) { + return (*FuncPtr::func_ptr())(std::forward(args)...); + } +}; +} // namespace detail + +// WrapFunctionIntoFunctor: Wraps a compile time function pointer into a kernel +// functor. Since it is a compile time function pointer, many compilers can +// inline it into the wrapper and you don't get any performance overhead for +// wrapping. +template +struct WrapFunctionIntoFunctor final { + static_assert( + c10::is_compile_time_function_pointer::value, + "WrapFunctionIntoFunctor can only wrap functions created with TORCH_FN."); + using type = detail::WrapFunctionIntoFunctor_< + FuncPtr, + typename guts::function_traits::return_type, + typename guts::function_traits< + typename FuncPtr::FuncType>::parameter_types>; +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h new file mode 100644 index 0000000000000000000000000000000000000000..0ff4e3dbc917c8dc86605c403c3733539c4779db --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h @@ -0,0 +1,46 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10::impl { + +namespace detail { +template +class WrapFunctionIntoRuntimeFunctor_ {}; +template +class WrapFunctionIntoRuntimeFunctor_< + FuncType, + ReturnType, + guts::typelist::typelist> + final : public c10::OperatorKernel { + public: + template + explicit WrapFunctionIntoRuntimeFunctor_(FuncType_&& kernel_func) + : kernel_func_(std::forward(kernel_func)) {} + + decltype(auto) operator()(Parameters... args) { + return kernel_func_(std::forward(args)...); + } + + private: + FuncType kernel_func_; +}; +} // namespace detail + +// WrapFunctionIntoRuntimeFunctor: Wraps any runtime functor into a functor that +// inherits from c10::OperatorKernel, so it can be used as a c10 kernel. +// This can, for example, be used for lambdas, functors or even function +// pointers. In the case of function pointers, since it is a runtime function +// pointer, there is an overhead for calling it whenever the kernel is invoked. +template +using WrapFunctionIntoRuntimeFunctor = detail::WrapFunctionIntoRuntimeFunctor_< + FuncType, + typename guts::infer_function_traits_t::return_type, + typename guts::infer_function_traits_t::parameter_types>; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h new file mode 100644 index 0000000000000000000000000000000000000000..ed93dfef4637046783ab9d7e88c7919e9fc75d04 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h @@ -0,0 +1,415 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// This file contains boxing (not unboxing) logic, +// i.e. how to make a vector from a set of concrete arguments. + +#include +#include +#include + +#include + +#include +#include + +namespace c10::impl { + +// +// utils +// + +// is_mutable_tensor_ref +template +struct is_mutable_tensor_ref : std::false_type {}; +template <> +struct is_mutable_tensor_ref : std::true_type {}; + +// is_tuple_of_mutable_tensor_refs +// +template +struct is_tuple_of_mutable_tensor_refs : std::false_type {}; + +template +struct is_tuple_of_mutable_tensor_refs< + T, + std::enable_if_t::value, void>> + : guts::typelist:: + all> {}; + +// has_ivalue_to tests the presence/absence of instance method +// IValue::to() +// +template +struct has_ivalue_to : std::false_type {}; + +template +struct ivalue_to_helper { + using type = decltype(std::declval().template to()); +}; +template +using ivalue_to_helper_t = typename ivalue_to_helper::type; + +template +struct has_ivalue_to>> : std::true_type {}; + +// +// boxing predicates +// + +// A boxable arg type is one that IValue has a constructor for. +template +using can_box = std::disjunction< + std::is_constructible>, + // TensorOptions are not directly constructible into IValue, + // but torch::jit::push knows how to handle them + std::is_same>>; + +template +using can_box_all = std::conjunction...>; + +// an unboxable result is one that can be extracted from an IValue +template +using can_unbox = std::conjunction< + std::disjunction< + has_ivalue_to, + // void returns are ok + std::is_same>, + std::negation>>; + +// +// boxArgs - utility for pushing unboxed args onto IValue stack +// +template +torch::jit::Stack boxArgs(Args... args) { + // TODO Reuse stack vector instead of allocating? + torch::jit::Stack stack; + stack.reserve(sizeof...(Args)); + torch::jit::push(stack, std::forward(args)...); + return stack; +} + +template +inline constexpr size_t boxed_size_one() { + static_assert( + !std::is_same_v, c10::TensorOptions>, + "need to patch this path to support TensorOptions passed by reference"); + return 1; +} + +// torch::jit::push pushes 4 values for a TensorOptions; this needs to +// be kept in sync. +template <> +inline constexpr size_t boxed_size_one() { + return 4; +} + +// NOTE: this could probably be simplified with C++17 fold expressions. +template +struct BoxedSize : std::integral_constant {}; +template +struct BoxedSize + : std::integral_constant< + size_t, + boxed_size_one() + BoxedSize::value> {}; + +template +static inline constexpr size_t boxed_size() { + return BoxedSize::value; +} + +template +C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(IValue*& dest, T& arg) { + new (dest++) IValue(arg); +} + +C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack( + IValue*& dest, + c10::TensorOptions options) { + new (dest++) IValue(c10::typeMetaToScalarType(options.dtype())); + new (dest++) IValue(options.layout()); + new (dest++) IValue(options.device()); + new (dest++) IValue(options.pinned_memory()); +} + +inline void boxArgsToStack(IValue*& /*unused*/) {} + +template +C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack( + IValue*& dest, + T& arg, + Args&... args) { + boxToStack(dest, arg); + boxArgsToStack(dest, args...); +} + +// +// PopResult is a helper class whose specializations handle popping single and +// multiple return values, respectively. +// +template +struct PopResult final { + static Result call(Stack& stack) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == 1, + "Boxed kernel was expected to return one value on the stack, ", + "but instead pushed ", + stack.size(), + " values."); + return std::move(stack[0]).to(); + } +}; + +template +struct PopResult> final { + using Result = std::tuple; + + static Result call(Stack& stack) { + // for tuple return types, boxed kernel has pushed multiple values onto the + // stack + constexpr int RetCount = sizeof...(Types); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == RetCount, + "Boxed kernel was expected to return ", + RetCount, + " values on the stack, ", + "but instead pushed ", + stack.size(), + " values."); + return pop_to_tuple_impl(stack, std::make_index_sequence()); + } + + private: + // note: this has been moved into its own helper only to avoid a parse error + // on `indices` otherwise. I'm sure there's an incantation that slips it past + // the parser but eh + template + static Result pop_to_tuple_impl( + Stack& stack, + std::index_sequence /*unused*/) { + return std::make_tuple((std::move(stack[indices]).template to())...); + } +}; + +// +// BoxedKernelWrapper +// +// For a given function type FT, BoxedKernelWrapper implements +// a `call` method that +// - takes a boxed kernel and unboxed arguments as specified by FT, +// - calls `boxArgs` to box the arguments +// - calls the boxed kernel +// - unboxes and returns the result +// +// The partial specializations below handle various cases: in +// particular, not all types appearing in op signatures are supported, +// and ops returning references have nonstandard wrapper implementations. +// + +// 1. The base specialization of BoxedKernelWrapper should never be +// instantiated. A "no call method defined on BoxedKernelWrapper" compile error +// means that an op signature has failed to trigger any of the partial +// specializations that follow this one. +// +template +struct BoxedKernelWrapper { + // The reason we're not just doing straight up static_assert(false, ...) here: + // Basically, the way to make sure a static_assert only fires if a template + // is actually instantiated (rather than every time the file is parsed) is to + // use template parameters in the expression, e.g. FuncType here. However, + // since `sizeof(FuncType) != sizeof(FuncType)` is always false, this has the + // same effect. + static_assert( + sizeof(FuncType) != sizeof(FuncType), + "Function signature contains one or more unsupported parameter and/or return types. " + "Look for a nearby error like " + "\"'call' is not a member of 'c10::impl::BoxedKernelWrapper<(your function type), void>'\" " + "- (your function type) is the unsupported signature."); +}; + +// +// 2. Supported signatures, other than those involving non-const Tensor refs - +// i.e., "functional" ops. +// + +template +struct BoxedKernelWrapper< + Result(Args...), + std::enable_if_t< + can_box_all::value && can_unbox::value && + !is_tuple_of_mutable_tensor_refs::value, + void>> { + static Result call( + const BoxedKernel& boxed_kernel_func, + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Args... args) { + torch::jit::Stack stack = boxArgs(std::forward(args)...); + boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); + + if constexpr (!std::is_same_v) { + // op has pushed one or more values onto the stack. + return PopResult::call(stack); + } else { + // op returns void, boxed kernel has pushed nothing onto stack. + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.empty(), + "Boxed kernel was expected to return no values on the stack, ", + "but instead returned ", + stack.size(), + " values."); + } + } +}; + +// +// 3. in-place ops take a single non-const Tensor reference +// as their first argument, and return it. +// +// Note: all signatures matching this pattern are assumed to be for such ops. +// Because of this, the generated BoxedKernelWrapper specializations simply +// return the in-place argument. +// + +template +struct BoxedKernelWrapper< + at::Tensor&(at::Tensor&, OtherArgs...), + std::enable_if_t::value, void>> { + static at::Tensor& call( + const BoxedKernel& boxed_kernel_func, + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + at::Tensor& outArg, + OtherArgs... otherArgs) { + torch::jit::Stack stack = boxArgs( + outArg, std::forward(otherArgs)...); + boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == 1, + "Boxed kernel was expected to return a single value on the stack, ", + "but instead returned ", + stack.size(), + " values."); + + return outArg; + } +}; + +// +// 3.5. In-process migration to make in-place ops take and return +// const references instead. +template +struct BoxedKernelWrapper< + const at::Tensor&(const at::Tensor&, OtherArgs...), + std::enable_if_t::value, void>> { + static const at::Tensor& call( + const BoxedKernel& boxed_kernel_func, + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + const at::Tensor& outArg, + OtherArgs... otherArgs) { + torch::jit::Stack stack = boxArgs(outArg, otherArgs...); + boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == 1, + "Boxed kernel was expected to return a single value on the stack, ", + "but instead returned ", + stack.size(), + " values."); + + return outArg; + } +}; + +// +// 4. out of place ops that take a single non-const Tensor reference as their +// final argument, and also return it. +// +// Note: all signatures matching this pattern are assumed to be for such ops. +// This assumption permits the generated BoxedKernelWrapper specializations to +// simply return out arguments. +// +template +struct BoxedKernelWrapper< + at::Tensor&(FirstArg, RestArgs...), + std::enable_if_t< + can_box_all::value + // this skips over in-place kernels with a non-const Tensor + // arg at the front, so those can unambiguously trigger the + // preceding specialization. + && !is_mutable_tensor_ref::value, + void>> { + static at::Tensor& call( + const BoxedKernel& boxed_kernel_func, + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + FirstArg firstArg, + RestArgs... restArgs) { + torch::jit::Stack stack = boxArgs( + std::forward(firstArg), std::forward(restArgs)...); + boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == 1, + "Boxed kernel was expected to return a single value on the stack, ", + "but instead returned ", + stack.size(), + " values."); + + // reusing restArgs after it has been forwarded here is ok because we know + // that the last element is of type `Tensor&`. + return std::get( + std::tuple{restArgs...}); + } +}; + +// +// 5. out of place ops that take multiple non-const Tensor references as their +// final arguments, and return them in a std::tuple. +// +// Note: all signatures matching this pattern are assumed to be for such ops. +// This assumption permits the generated BoxedKernelWrapper specializations to +// simply return the out arguments. +// +template +struct BoxedKernelWrapper< + Result(Args...), + std::enable_if_t< + can_box_all::value && + is_tuple_of_mutable_tensor_refs::value, + void>> { + static Result call( + const BoxedKernel& boxed_kernel_func, + const OperatorHandle& opHandle, + DispatchKeySet dispatchKeySet, + Args... args) { + using ArgTuple = std::tuple; + constexpr int RetCount = std::tuple_size(); + + torch::jit::Stack stack = boxArgs(std::forward(args)...); + boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == RetCount, + "Boxed kernel was expected to return ", + RetCount, + " values on the stack, ", + "but instead returned ", + stack.size(), + " values."); + + // reusing args after it has been forwarded here is ok because we know + // that the last RetCount elements are of type `Tensor&`. + auto result = guts::tuple_take( + ArgTuple{std::forward(args)...}); + static_assert( + std::is_same_v, + "The parameter list of an op returning a tuple of Tensor references " + "must end with an equal number of Tensor reference parameters."); + return result; + } +}; + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..fac192e893c4fc6d40714ebc2d24f848d736819a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -0,0 +1,790 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace c10 { + +using Stack = torch::jit::Stack; // TODO Instead of this, move torch::jit::Stack + // to the c10 namespace. +class OperatorHandle; + +/* + * [Note: Argument forwarding in the dispatcher] + * + * The dispatcher uses a somewhat unusual way to forward arguments through + * several layers of wrapper functions. This can be confusing because an + * experienced C++ programmer would look at this and think "oh this is supposed + * to be forwarding a universal reference but the && is missing. This is a + * bug.". It is not a bug. The common way in C++ to forward arguments is to use + * universal references: + * + * > template void func(T&& arg) { func2(std::forward(arg)); } + * + * but that relies on inferring the correct reference type (i.e. value vs & vs + * &&) from the argument. In our case, we cannot rely on the argument as + * supplied by the caller, because that could infer a different reference type + * than was used in the kernel function. The correct reference type is dictated + * by the kernel signature and must be identical since we cast function pointers + * through void* pointers and mismatches would be UB. So we need a forwarding + * pattern that determines the reference type to use by looking at the + * explicitly supplied operator signature, not by looking at the argument we're + * calling it with. + * + * What does std::forward do, exactly? + * ------------------------------------ + * std::forward(t) is a way to cast t to the reference type supplied in T. + * Let's assume decay_t == U and T is either U or some reference of U. + * - std::forward(t) will return U&, no matter what kind of reference t is. + * - std::forward(t) will return U&&, no matter what kind of reference t + * is. + * - std::forward(t) will return U&& (not U!), no matter what kind of + * reference t is. + * + * For universal references, that means that in the following function + * > template void func(T&& arg) { func2(std::forward(arg)); } + * + * - when called with arg being a rvalue reference or non-reference value, T + * gets inferred to be a non-reference U, and std::forward(t) will return + * U&&, correctly moving the argument. + * - when called with arg behind a lvalue reference, T gets inferred to be U& + * because that's the only way to match the signature (in C++, a type that is + * (T&)&& will collapse to T&). That means std::forward(t) will return U& and + * the value will not be moved but passed on as a lvalue reference. + * + * How do we use that? + * ------------------------------------ + * But std::forward can also be used outside of the common "universal + * forwarding" pattern to change reference types. So instead of following the + * common C++ pattern, we notice what std::forward() actually does, and that + * is it takes a value and changes its reference to the type of reference passed + * in as T. If we don't infer T but explicitly specify it, we can use this to + * forward based on an explicitly specified reference type instead of the + * inferred argument type. + * + * This is why many of the dispatcher functions look like + * > template func(T t) { func2(std::forward(t)); } + * instead of the common + * > template func(T&& t) { func2(std::forward(t)); } + * + * and are expected to be called by explicitly specifying the template + * parameters in a way that matches the expected operator signature at each call + * site. + */ + +namespace impl { +// supported_primitive_arg_types defines which primitive types we allow in +// kernel functions as arguments or returns. +// Additionally, we support lists, dicts and optionals containing these types. +using supported_primitive_arg_types = guts::typelist::typelist< + int64_t, + double, + bool, + std::string_view, + at::Tensor, + at::Scalar, + c10::QScheme, + c10::ScalarType, + c10::Device, + c10::DeviceIndex, + c10::Layout, + c10::MemoryFormat, + at::Dimname>; + +// We have an unboxed functor in hand that takes C++ arguments, and +// we're building a boxed functor wrapper for it that takes IValues. +// So "outside" is boxed and "inside" is unboxed. +// +// So a valid input type is one that our boxed functor wrapper can +// unbox from an IValue into a C++ value. +// +// Whereas a valid output type is one that our wrapper can receive +// as a C++ value from the unboxed functor, and box into an IValue. + +// +// assert_is_valid_input_type +// checks that T can be unboxed from an IValue into a C++ value. +// + +template +struct assert_is_valid_input_type { + assert_is_valid_input_type() { + if constexpr (guts::typelist::contains:: + value) { + /* everything is ok, this is a primitive type */ + } else { + /* otherwise this must be an instance of a valid custom class, since it + can only have been created via IValue(x), which ensures this. */ + } + } +}; + +template +struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type {}; + +template +struct TypeCheckHelper; + +template +struct TypeCheckHelper {}; + +template +struct TypeCheckHelper + : TypeCheckHelper { + assert_is_valid_input_type check; +}; + +template +struct assert_is_valid_input_type< + std::tuple, + AllowDeprecatedTypes> + : TypeCheckHelper {}; + +template +struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + guts::typelist::contains::value, + "You tried to register a kernel with an unsupported input type: Dict where Key is invalid. We only support int64_t, double, bool, and string."); +}; + +template +struct assert_is_valid_input_type< + std::unordered_map, + AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + AllowDeprecatedTypes, + "You tried to register a kernel with an unsupported input type: std::unordered_map. Please use Dict instead."); + static_assert( + guts::typelist::contains::value, + "You tried to register a kernel with an unsupported input type: std::unordered_map where Key is invalid. We only support int64_t, double, bool, and string."); +}; + +template +struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported input type: List. Please use List, List or Tensor instead."); +}; + +template +struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported input type: ArrayRef. Please use List, List or Tensor instead."); +}; + +template +struct assert_is_valid_input_type< + c10::OptionalArrayRef, + AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported input type: OptionalArrayRef. Please use List, List or Tensor instead."); +}; + +template +struct assert_is_valid_input_type, AllowDeprecatedTypes> + : assert_is_valid_input_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported input type: std::array. Please use std::array instead."); +}; + +template +struct assert_is_valid_input_type< + T, + AllowDeprecatedTypes, + std::enable_if_t>> { + // There is no reason to support float when we have double. Keep the API lean. + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported input type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string."); +}; +template +struct assert_is_valid_input_type< + T, + AllowDeprecatedTypes, + std::enable_if_t>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported input type: const char*. Please use std::string_view instead."); +}; +template +struct assert_is_valid_input_type< + T, + AllowDeprecatedTypes, + std::enable_if_t, T>>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported input type: vector. Please use List instead."); +}; +template +struct assert_is_valid_input_type< + T, + AllowDeprecatedTypes, + std::enable_if_t< + std::is_integral_v && + !guts::typelist::contains::value>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported integral input type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string."); +}; +template +struct assert_is_valid_input_type< + T, + AllowDeprecatedTypes, + std::enable_if_t>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel taking c10::SymInt by reference. Please accept it by value instead."); +}; + +// TODO: it probably would be good to tighten this up quite a bit more with +// an explicit list for everything + +// +// assert_is_valid_output_type +// + +template +struct assert_is_valid_output_type { + assert_is_valid_output_type() { + if constexpr (guts::typelist::contains:: + value) { + /* everything is ok, this is a primitive type */ + } else { + /* otherwise T is verified to be a registered custom class in the IValue + constructor, so no benefit in double-checking here */ + } + } +}; + +template +struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type {}; + +template +struct assert_is_valid_output_type< + c10::OptionalArrayRef, + AllowDeprecatedTypes> + : assert_is_valid_output_type {}; + +template +struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type { + static_assert( + guts::typelist::contains::value, + "You tried to register a kernel with an unsupported output type: Dict where Key is invalid. We only support int64_t, double, bool, and string."); + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported output type: Dict. Please use Dict or Dict."); +}; + +template +struct assert_is_valid_output_type< + std::unordered_map, + AllowDeprecatedTypes> + : assert_is_valid_output_type { + static_assert( + AllowDeprecatedTypes, + "You tried to register a kernel with an unsupported output type: std::unordered_map. Please use Dict instead."); + static_assert( + guts::typelist::contains::value, + "You tried to register a kernel with an unsupported output type: std::unordered_map where Key is invalid. We only support int64_t, double, bool, and string."); + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported output type: std::unordered_map. Please use Dict or Dict."); +}; + +template +struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported output type: List. Please use List, List or Tensor instead."); +}; + +template +struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported output type: std::vector. Please use List, List or Tensor instead."); + // TODO static_assert(AllowDeprecatedTypes, "You tried to register a kernel + // with an unsupported output type: std::vector. Please use List + // instead."); +}; + +template +struct assert_is_valid_output_type, AllowDeprecatedTypes> + : assert_is_valid_output_type { + static_assert( + !std::is_same_v, + "You tried to register a kernel with an unsupported output type: std::array. Please use std::array instead."); +}; + +// The following specialisations of assert_is_valid_output_type are technically +// not necessary since we would hit the base case and show an error message +// there if they didn't exist, but we can show a better error message +// in some common error scenarios. +template +struct assert_is_valid_output_type< + T, + AllowDeprecatedTypes, + std::enable_if_t>> { + // There is no reason to support float when we have double. Keep the API lean. + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported output type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string."); +}; +template +struct assert_is_valid_output_type< + T, + AllowDeprecatedTypes, + std::enable_if_t>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported output type: const char*. Please use std::string_view instead."); +}; +template +struct assert_is_valid_output_type< + T, + AllowDeprecatedTypes, + std::enable_if_t, T>>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported output type: vector. Please use List instead."); +}; +template +struct assert_is_valid_output_type< + T, + AllowDeprecatedTypes, + std::enable_if_t< + std::is_integral_v && + !guts::typelist::contains::value>> { + static_assert( + guts::false_t::value, + "You tried to register a kernel with an unsupported integral output type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string."); +}; + +// ivalue_to_arg + +template +struct decay_if_not_tensor final { + using type = std::decay_t; +}; + +template <> +struct decay_if_not_tensor final { + using type = at::Tensor&; +}; + +template <> +struct decay_if_not_tensor final { + using type = const at::Tensor&; +}; + +template +struct ivalue_to_arg final { + static decltype(auto) call(IValue& v) { + assert_is_valid_input_type(); + return std::move(v).to(); + } +}; + +// The following two specializations take advantage of specialized +// `toTensor()` overloads on IValue to avoid copying. +template +struct ivalue_to_arg final { + // We cannot use the default implementation if they asked for a + // `at::Tensor&` because it moves from the IValue, so it can't get + // an lvalue reference. + static at::Tensor& call(IValue& v) { + // Tensor& is valid, don't bother asserting + return v.toTensor(); + } +}; + +template +struct ivalue_to_arg final { + // We should not use the default implementation if they asked for + // a `const at::Tensor&` because it moves from the IValue and they + // didn't ask for that. + static const at::Tensor& call(IValue& v) { + // const Tensor& is valid, don't bother asserting + return v.toTensor(); + } +}; + +template +struct ivalue_to_arg final { + static List call(IValue& v) { + return v.toTensorList(); + } +}; + +template +struct ivalue_to_arg, AllowDeprecatedTypes> final { + // If an argument is ArrayRef, convert the IValue to a std::vector and + // pass that to the operator. std::vector is implicitly convertible to + // ArrayRef. + static std::vector call(IValue& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(v); + } +}; +template +struct ivalue_to_arg final { + static std::vector call(IValue& v) { + if (v.isIntList()) { + std::vector r; + auto src = v.toIntList(); + std::transform( + src.begin(), src.end(), std::back_inserter(r), [](int64_t i) { + return c10::SymInt(i); + }); + return r; + } else { + return ivalue_to_arg, AllowDeprecatedTypes>:: + call(v); + } + } +}; +template +struct ivalue_to_arg, AllowDeprecatedTypes> + final { + static OptionalArray call(IValue& v) { + if (v.isIntList()) { + std::vector r; + auto src = v.toIntList(); + std::transform( + src.begin(), src.end(), std::back_inserter(r), [](int64_t i) { + return c10::SymInt(i); + }); + return OptionalArray(std::move(r)); + } else { + return std::move(v).to>(); + } + } +}; +template +struct ivalue_to_arg>, AllowDeprecatedTypes> final { + // If an argument is std::optional>, convert the IValue to an + // std::optional> and pass that to the operator. + // OptionalArray is basically a std::optional> but + // implicitly convertible to std::optional>. + static OptionalArray call(IValue& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(v); + } +}; + +template +struct ivalue_to_arg, AllowDeprecatedTypes> final { + // If an argument is OptionalArrayRef, convert the IValue to an + // std::optional> and pass that to the operator. + // OptionalArray is basically a std::optional> but + // implicitly convertible to OptionalArrayRef + static OptionalArray call(IValue& v) { + return ivalue_to_arg, AllowDeprecatedTypes>::call(v); + } +}; + +// return_to_ivalue +template +struct return_to_ivalue final {}; + +template +struct return_to_ivalue< + T, + AllowDeprecatedTypes, + std::enable_if_t>> + final { + static IValue call(T&& v) { + assert_is_valid_output_type(); + return c10::ivalue::from(std::move(v)); + } + static IValue copy(const T& v) { + assert_is_valid_output_type(); + return IValue(v); + } +}; + +// Special case to allow kernels to return `Tensor&`. +// TODO Delete this once kernels don't do that anymore +template +struct return_to_ivalue final { + static IValue call(at::Tensor& v) { + return c10::ivalue::from(v); + } + static IValue copy(at::Tensor& v) { + return IValue(v); + } +}; + +// wrap_kernel_functor_unboxed_ + +template +struct wrap_kernel_functor_unboxed_ final {}; + +// This specialization is for kernels with a first argument that is NOT of type +// DispatchKeySet This includes kernels with 0 arguments. +template +struct wrap_kernel_functor_unboxed_< + KernelFunctor, + ReturnType(ParameterTypes...)> + final { + static_assert( + std::is_same_v< + ReturnType, + typename guts::infer_function_traits_t::return_type>, + "Return type mismatch"); + static_assert( + std::is_same_v< + guts::typelist::typelist, + typename guts::infer_function_traits_t< + KernelFunctor>::parameter_types>, + "Parameter types mismatch"); + + // See [Note: Argument forwarding in the dispatcher] for why ParameterTypes + // doesn't use && + static ReturnType call( + OperatorKernel* functor, + DispatchKeySet /*unused*/, + ParameterTypes... args) { + KernelFunctor* functor_ = static_cast(functor); + // Note [Plumbing Keys Through The Dispatcher 2] + // See Note [Plumbing Keys Through The Dispatcher] for the background. + // This functor explicitly takes in a dispatchKeySet and drops it on the + // floor- it does not forward it to the registered kernel. + // + // This is due to the calling convention within the dispatcher, which + // expects all registered kernels to have a first argument of type + // DispatchKeySet. + // This is not the case for pretty much all manually written kernels, + // however- this functor serves to separate the calling convention of the + // dispatcher from the calling convention of manually written kernels. + return (*functor_)(std::forward(args)...); + } +}; + +// This specialization is for kernels with a first argument of type +// DispatchKeySet +template +struct wrap_kernel_functor_unboxed_< + KernelFunctor, + ReturnType(DispatchKeySet, ParameterTypes...)> + final { + static_assert( + std::is_same_v< + ReturnType, + typename guts::infer_function_traits_t::return_type>, + "Return type mismatch"); + static_assert( + std::is_same_v< + guts::typelist::typelist, + typename guts::infer_function_traits_t< + KernelFunctor>::parameter_types>, + "Parameter types mismatch"); + + // See [Note: Argument forwarding in the dispatcher] for why ParameterTypes + // doesn't use && + static ReturnType call( + OperatorKernel* functor, + DispatchKeySet dispatchKeySet, + ParameterTypes... args) { + KernelFunctor* functor_ = static_cast(functor); + // We're explicitly taking in a dispatchKeySet and forwarding it to the + // registered kernel. See Note [Plumbing Keys Through The Dispatcher 2] for + // details. + return (*functor_)(dispatchKeySet, std::forward(args)...); + } +}; + +template +using wrap_kernel_functor_unboxed = wrap_kernel_functor_unboxed_< + KernelFunctor, + typename guts::infer_function_traits_t::func_type>; + +// call_functor_with_args_from_stack + +template < + class Functor, + bool AllowDeprecatedTypes, + size_t... ivalue_arg_indices, + typename... ArgTypes> +std::decay_t::return_type> +call_functor_with_args_from_stack_( + OperatorKernel* functor, + DispatchKeySet dispatchKeySet, + Stack* stack, + std::index_sequence /*unused*/, + guts::typelist::typelist* /*unused*/) { + (void)stack; // when sizeof...(ivalue_arg_indices) == 0, this argument would + // be unused and we have to silence the compiler warning. + + // We're explicitly filtering out DispatchKeySet from the argument list. + // Some kernels take a DispatchKeySet as their first argument in order to + // plumb keys through the dispatcher. We don't want to expose the + // DispatchKeySet type to jit, so we don't include this argument on the stack. + // See Note [Plumbing Keys Through The Dispatcher] for the background. + return wrap_kernel_functor_unboxed::call( + functor, + dispatchKeySet, + ivalue_to_arg< + typename decay_if_not_tensor::type, + AllowDeprecatedTypes>:: + call(torch::jit::peek( + *stack, ivalue_arg_indices, sizeof...(ivalue_arg_indices)))...); +} + +template +std::decay_t::return_type> +call_functor_with_args_from_stack( + OperatorKernel* functor, + DispatchKeySet dispatchKeySet, + Stack* stack) { + // We're explicitly filtering out DispatchKeySet from the argument list. + // Some kernels take a DispatchKeySet as their first argument in order to + // plumb keys through the dispatcher. We don't want to expose the + // DispatchKeySet type to jit, so we don't include this argument on the stack. + // See Note [Plumbing Keys Through The Dispatcher] for the background. + using ArgTypes = typename c10::remove_DispatchKeySet_arg_from_func< + Functor>::parameter_types; + constexpr size_t num_ivalue_args = guts::typelist::size::value; + return call_functor_with_args_from_stack_( + functor, + dispatchKeySet, + stack, + std::make_index_sequence(), + static_cast(nullptr)); +} + +// push_outputs + +template +struct push_outputs final { + // Contrary to [Note: Argument forwarding in the dispatcher], we use + // OutputType&& here to avoid one extra call to the move constructor in this + // case. This is still not a universal reference though because OutputType is + // an explicitly specified class template parameter. + static void call(OutputType&& output, Stack* stack) { + torch::jit::push( + *stack, + return_to_ivalue::call( + std::forward(output))); + } + static void copy(const OutputType& output, Stack* stack) { + torch::jit::push( + *stack, + return_to_ivalue::copy(output)); + } +}; +template +struct push_outputs, AllowDeprecatedTypes> final { + static void call(std::tuple&& output, Stack* stack) { + call_( + std::move(output), + stack, + std::make_index_sequence()); + } + static void copy(const std::tuple& output, Stack* stack) { + copy_(output, stack, std::make_index_sequence()); + } + + private: + template + static void call_( + std::tuple&& output, + Stack* stack, + std::index_sequence /*unused*/) { + torch::jit::push( + *stack, + return_to_ivalue::call( + std::forward(std::get(output)))...); + } + template + static void copy_( + const std::tuple& output, + Stack* stack, + std::index_sequence /*unused*/) { + torch::jit::push( + *stack, + return_to_ivalue::copy( + std::get(output))...); + } +}; +template +struct push_outputs final { + static void call(int /*dummy*/, Stack* /*stack*/) {} + static void copy(int /*dummy*/, Stack* /*stack*/) {} +}; + +// make_boxed_from_unboxed_functor + +template +struct make_boxed_from_unboxed_functor final { + static_assert( + std::is_base_of_v, + "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + + static void call( + OperatorKernel* functor, + const OperatorHandle& /*unused*/, + DispatchKeySet dispatchKeySet, + Stack* stack) { + using ReturnType = + typename guts::infer_function_traits_t::return_type; + // We're explicitly filtering out DispatchKeySet from the argument list. + // Some kernels take a DispatchKeySet as their first argument in order to + // plumb keys through the dispatcher. We don't want to expose the + // DispatchKeySet type to jit, so we don't include this argument on the + // stack. See Note [Plumbing Keys Through The Dispatcher] for the + // background. + using ArgTypes = typename c10::remove_DispatchKeySet_arg_from_func< + KernelFunctor>::parameter_types; + constexpr bool has_outputs = !std::is_same_v; + constexpr size_t num_inputs = guts::typelist::size::value; + if constexpr (has_outputs) { + // Decay ReturnType to ReturnType_ so that if a reference gets returned, + // we actually store it by value and don't get a dangling reference. This + // is only required because some kernels still return `Tensor&`. [Note: + // VC++ and 'std': ambiguous symbol] + using ReturnType_ = ::std::decay_t; + ReturnType_ output = call_functor_with_args_from_stack< + KernelFunctor, + AllowDeprecatedTypes>(functor, dispatchKeySet, stack); + torch::jit::drop(*stack, num_inputs); + // See note [ VC++ and 'std': ambiguous symbol] + push_outputs::call( + ::std::move(output), stack); + } else { + call_functor_with_args_from_stack( + functor, dispatchKeySet, stack); + torch::jit::drop(*stack, num_inputs); + } + } +}; +} // namespace impl + +} // namespace c10 + +namespace torch { +using OperatorKernel = c10::OperatorKernel; +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..aecf24471b02853caed9872783e3fdb3f3aaf011 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h @@ -0,0 +1,145 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +template +inline std::vector makeStack(Inputs&&... inputs) { + return {std::forward(inputs)...}; +} + +inline at::Tensor dummyTensor( + c10::DispatchKeySet ks, + bool requires_grad = false) { + auto* allocator = c10::GetCPUAllocator(); + int64_t nelements = 1; + auto dtype = caffe2::TypeMeta::Make(); + int64_t size_bytes = nelements * dtype.itemsize(); + auto storage_impl = c10::make_intrusive( + c10::StorageImpl::use_byte_size_t(), + size_bytes, + allocator->allocate(size_bytes), + allocator, + /*resizable=*/true); + at::Tensor t = + at::detail::make_tensor(storage_impl, ks, dtype); + // TODO: We add this to simulate the ideal case where we only have Autograd + // backend keys + // on Tensor when it requires grad. But currently Autograd keys are + // added in TensorImpl constructor by default. + if (!requires_grad) { + t.unsafeGetTensorImpl()->remove_autograd_key(); + } + return t; +} + +inline at::Tensor dummyTensor( + c10::DispatchKey dispatch_key, + bool requires_grad = false) { + return dummyTensor(c10::DispatchKeySet(dispatch_key), requires_grad); +} + +template +inline std::vector callOp( + const c10::OperatorHandle& op, + Args... args) { + auto stack = makeStack(std::forward(args)...); + op.callBoxed(&stack); + return stack; +} + +template +inline Result callOpUnboxed(const c10::OperatorHandle& op, Args... args) { + return op.typed().call(std::forward(args)...); +} + +template +inline Result callOpUnboxedWithDispatchKey( + const c10::OperatorHandle& op, + c10::DispatchKey dispatchKey, + Args... args) { + return op.typed().callWithDispatchKey( + dispatchKey, std::forward(args)...); +} + +template +inline Result callOpUnboxedWithPrecomputedDispatchKeySet( + const c10::OperatorHandle& op, + c10::DispatchKeySet ks, + Args... args) { + return op.typed().redispatch( + ks, std::forward(args)...); +} + +inline void expectDoesntFindKernel( + const char* op_name, + c10::DispatchKey dispatch_key) { + auto op = c10::Dispatcher::singleton().findSchema({op_name, ""}); + EXPECT_ANY_THROW(callOp(*op, dummyTensor(dispatch_key), 5);); +} + +inline void expectDoesntFindOperator(const char* op_name) { + auto op = c10::Dispatcher::singleton().findSchema({op_name, ""}); + EXPECT_FALSE(op.has_value()); +} + +template +inline void expectThrows(Functor&& functor, const char* expectMessageContains) { + try { + std::forward(functor)(); + } catch (const Exception& e) { + EXPECT_THAT(e.what(), testing::HasSubstr(expectMessageContains)); + return; + } + ADD_FAILURE() << "Expected to throw exception containing \"" + << expectMessageContains << "\" but didn't throw"; +} + +template +void expectListEquals(c10::ArrayRef expected, std::array actual) { + EXPECT_EQ(expected.size(), actual.size()); + for (const auto i : c10::irange(expected.size())) { + EXPECT_EQ(expected[i], actual[i]); + } +} + +template +void expectListEquals(c10::ArrayRef expected, c10::ArrayRef actual) { + EXPECT_EQ(expected.size(), actual.size()); + for (const auto i : c10::irange(expected.size())) { + EXPECT_EQ(expected[i], actual[i]); + } +} + +template +void expectListEquals(c10::ArrayRef expected, c10::List actual) { + EXPECT_EQ(expected.size(), actual.size()); + for (const auto i : c10::irange(expected.size())) { + EXPECT_EQ(expected[i], actual.get(i)); + } +} + +template +void expectListEquals(c10::ArrayRef expected, std::vector actual) { + EXPECT_EQ(expected.size(), actual.size()); + for (const auto i : c10::irange(expected.size())) { + EXPECT_EQ(expected[i], actual[i]); + } +} + +// NB: This is not really sound, but all of the type sets constructed here +// are singletons so it's fine +static inline c10::DispatchKey extractDispatchKey(const at::Tensor& t) { + return legacyExtractDispatchKey(t.key_set()); +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h new file mode 100644 index 0000000000000000000000000000000000000000..6812e6c1dc0d6656f3522fa1832a90101d4d80e7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h @@ -0,0 +1,72 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace c10::impl { + +// A CppSignature object holds RTTI information about a C++ function signature +// at runtime and can compare them or get a debug-printable name. +class TORCH_API CppSignature final { + public: + CppSignature(const CppSignature&) = default; + CppSignature(CppSignature&&) noexcept = default; + CppSignature& operator=(const CppSignature&) = default; + CppSignature& operator=(CppSignature&&) noexcept = default; + + template + static CppSignature make() { + // Normalize functors, lambdas, function pointers, etc. into the plain + // function type The first argument of the schema might be of type + // DispatchKeySet, in which case we remove it. We do this to guarantee that + // all CppSignature's for an operator will match, even if they're registered + // with different calling conventions. + // See Note [Plumbing Keys Through The Dispatcher] + using decayed_function_type = + typename c10::remove_DispatchKeySet_arg_from_func< + std::decay_t>::func_type; + + return CppSignature(std::type_index(typeid(decayed_function_type))); + } + + std::string name() const { + return c10::demangle(signature_.name()); + } + + friend bool operator==(const CppSignature& lhs, const CppSignature& rhs) { + if (lhs.signature_ == rhs.signature_) { + return true; + } + // Without RTLD_GLOBAL, the type_index comparison could yield false because + // they point to different instances of the RTTI data, but the types would + // still be the same. Let's check for that case too. + // Note that there still is a case where this might not work, i.e. when + // linking libraries of different compilers together, they might have + // different ways to serialize a type name. That, together with a missing + // RTLD_GLOBAL, would still fail this. + if (0 == strcmp(lhs.signature_.name(), rhs.signature_.name())) { + return true; + } + + return false; + } + + private: + explicit CppSignature(std::type_index signature) + : signature_(std::move(signature)) {} + std::type_index signature_; +}; + +inline bool operator!=(const CppSignature& lhs, const CppSignature& rhs) { + return !(lhs == rhs); +} + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h new file mode 100644 index 0000000000000000000000000000000000000000..78b8cecac1db5d571ec9fb88de9f294505f4b271 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h @@ -0,0 +1,285 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +namespace impl { + +// Take a DispatchKeySet for a Tensor and determine what the actual dispatch +// DispatchKey should be, taking into account TLS, and skipping backends which +// fall through. +// +// Unlike Tensor::key_set(), the value of this on a tensor can change depending +// on TLS. +// +// NB: If there is no valid dispatch key, this will return Undefined +inline DispatchKeySet computeDispatchKeySet( + DispatchKeySet ks, + // The key mask lets us eliminate (by zero entries) keys which should not + // be considered for dispatch. There are two cases when we use this: + // + // - If an operator's dispatch table contains a fallthrough entry, we + // should bypass it entirely when finding the key + // - If a user invokes with redispatch, the mask lets us + // zero out the key the user asked us to stop. + // + // These excluded backends are NOT tracked in the TLS, but must be applied + // AFTER TLS (since the backend may have been introduced for consideration + // by the included TLS), which is why you have to pass them in to this + // function (as opposed to just applying it to the input 'ks'). + DispatchKeySet key_mask) { + c10::impl::LocalDispatchKeySet local = + c10::impl::tls_local_dispatch_key_set(); + // TODO: It's a bit irritating that we have to do logical ORs here, it would + // be nice to only do one. Can always_included be folded into the TLS? Well, + // it's a bit troublesome, because fastpath TLS access requires the type of + // the TLS in question to be zero-initialized, so you don't actually win + // anything in that case. + return (((ks | local.included_) - local.excluded_) & key_mask); +} + +} // namespace impl + +namespace detail { +// A small gadget to extract the DispatchKeySet from types which are known +// to have it. Used to extract dispatch keys from unboxed calls. +struct MultiDispatchKeySet : at::IterArgs { + DispatchKeySet ts; + void operator()(const at::Tensor& x) { + ts = ts | x.key_set(); + } + void operator()(const std::optional& x) { + if (x.has_value()) { + ts = ts | x->key_set(); + } + } + void operator()(at::ArrayRef xs) { + for (const auto& x : xs) { + ts = ts | x.key_set(); + } + } + // Tensor?[] translates to this case. + void operator()(const c10::List>& xs) { + for (std::optional x : xs) { + if (x.has_value()) { + ts = ts | x.value().key_set(); + } + } + } + // Structured Tensor[] translates to this case + void operator()(const at::ITensorListRef& xs) { + for (const auto& x : xs) { + ts = ts | x.key_set(); + } + } + [[noreturn]] void operator()( + at::ArrayRef> /*unused*/) { + // Just checking that the handling of Tensor?[] didn't change. + TORCH_INTERNAL_ASSERT(false); + } + void operator()(const at::Generator& gen) { + if (gen.defined()) { + ts = ts | gen.key_set(); + } + } + void operator()(const std::optional& gen) { + if (gen.has_value() && gen->defined()) { + ts = ts | gen->key_set(); + } + } + template + void operator()(const T& /*unused*/) { + // do nothing + } +}; + +// NB: take by const reference (Don't do universal forwarding here! You +// don't want to move into this function!) +template +DispatchKeySet multi_dispatch_key_set(const Args&... args) { + return MultiDispatchKeySet().apply(args...).ts; +} +} // namespace detail + +/** + * An instance of DispatchKeyExtractor knows how to get a dispatch key given + * a list of arguments for an operator call. + * + * The instance is specific for a certain operator as: + * - In boxed dispatch, different operators have different ways to extract + * the dispatch key (e.g. different numbers of arguments), and we precompute + * the stack locations we should look at; and + * - In all dispatch, some backends should be excluded from dispatch because + * they have been registered as fallthrough. The set of excluded backends + * varies from operator, as some operators may have overridden the + * fallthrough with custom behavior. + * + * Note - this should maintain identical impl to the py dispatcher key + * extraction logic at pytorch/torch/dispatcher.py + */ +struct TORCH_API DispatchKeyExtractor final { + public: + static DispatchKeyExtractor make(const FunctionSchema& schema) { + return DispatchKeyExtractor(makeBitsetForDispatchArgs(schema)); + } + + static DispatchKeyExtractor makeUninitialized() { + return DispatchKeyExtractor(c10::utils::bitset()); + } + + void registerSchema(const FunctionSchema& schema) { + TORCH_INTERNAL_ASSERT(dispatch_arg_indices_reverse_.is_entirely_unset()); + dispatch_arg_indices_reverse_ = makeBitsetForDispatchArgs(schema); + } + void deregisterSchema() { + dispatch_arg_indices_reverse_ = c10::utils::bitset(); + } + + DispatchKeySet getDispatchKeySetBoxed(const torch::jit::Stack* stack) const { + DispatchKeySet ks; + dispatch_arg_indices_reverse_.for_each_set_bit([&](size_t + reverse_arg_index) { + const auto& ivalue = torch::jit::peek(*stack, 0, reverse_arg_index + 1); + if (C10_LIKELY(ivalue.isTensor())) { + // NB: Take care not to introduce a refcount bump (there's + // no safe toTensorRef method, alas) + ks = ks | ivalue.unsafeToTensorImpl()->key_set(); + } else if (C10_UNLIKELY(ivalue.isTensorList())) { + // NB: use toListRef as it doesn't induce refcount bumps + // (toTensorListRef is not a thing) + for (const auto& nv : ivalue.toListRef()) { + auto* tensor = nv.unsafeToTensorImpl(); + ks = ks | tensor->key_set(); + } + } + // Tensor?[] translates to a c10::List so we need to peek inside + else if (C10_UNLIKELY(ivalue.isList())) { + for (const auto& elt : ivalue.toListRef()) { + if (elt.isTensor()) { + ks = ks | elt.toTensor().key_set(); + } + } + } + }); + // Keys that are fallthrough should be skipped + if (requiresBitsetPerBackend_) { + c10::impl::LocalDispatchKeySet tls = + c10::impl::tls_local_dispatch_key_set(); + auto backend_idx = + ((ks | tls.included_) - tls.excluded_).getBackendIndex(); + return impl::computeDispatchKeySet( + ks, nonFallthroughKeysPerBackend_[backend_idx]); + } else { + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + } + } + + template + DispatchKeySet getDispatchKeySetUnboxed(const Args&... args) const { + auto ks = detail::multi_dispatch_key_set(args...); + // Keys that are fallthrough should be skipped + if (requiresBitsetPerBackend_) { + c10::impl::LocalDispatchKeySet tls = + c10::impl::tls_local_dispatch_key_set(); + auto backend_idx = + ((ks | tls.included_) - tls.excluded_).getBackendIndex(); + return impl::computeDispatchKeySet( + ks, nonFallthroughKeysPerBackend_[backend_idx]); + } else { + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + } + } + + void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough); + + std::string dumpState() const; + void checkInvariants(const FunctionSchema& schema) const; + + private: + static bool isDispatchType(const Type& type) { + // Checking isSubtypeOf on a DynamicType heap-allocates a + // DynamicType version of the argument if it's not a DynamicType + // already, and this has measurable overhead during startup. +#ifdef C10_MOBILE + struct CachedTypes { + DynamicTypePtr listOfTensors; + DynamicTypePtr listOfOptionalTensors; + DynamicTypePtr optionalOfTensor; + }; + static const CachedTypes ct = { + DynamicType::create(*ListType::ofTensors()), + DynamicType::create(*ListType::ofOptionalTensors()), + DynamicType::create(*OptionalType::ofTensor())}; + return type.isSubtypeOf(c10::TypeFactory::get()) || + type.isSubtypeOf(ct.listOfTensors) || + type.isSubtypeOf(ct.listOfOptionalTensors) || + type.isSubtypeOf(ct.optionalOfTensor); +#else // C10_MOBILE + return type.isSubtypeOf(*TensorType::get()) || + type.isSubtypeOf(*ListType::ofTensors()) || + type.isSubtypeOf(*ListType::ofOptionalTensors()) || + type.isSubtypeOf(*OptionalType::ofTensor()); +#endif // C10_MOBILE + } + static c10::utils::bitset makeBitsetForDispatchArgs( + const FunctionSchema& schema) { + TORCH_CHECK( + schema.arguments().size() <= c10::utils::bitset::NUM_BITS(), + "The function schema has ", + schema.arguments().size(), + " arguments but this PyTorch build only supports ", + c10::utils::bitset::NUM_BITS()); + c10::utils::bitset dispatch_arg_indices_reverse; + for (const auto index : c10::irange(schema.arguments().size())) { + if (isDispatchType(*schema.arguments()[index].type())) { + dispatch_arg_indices_reverse.set(schema.arguments().size() - 1 - index); + } + } + return dispatch_arg_indices_reverse; + } + + explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse) + : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse), + nonFallthroughKeys_(DispatchKeySet::FULL) { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL; + } + } + + // this is a bitset that has ones for each argument index which has to be + // considered for dispatch. This avoids having to iterate over the stack + // to find all the tensors. The bits are stored in reverse order, i.e. + // dispatch_arg_indices_reverse_[i] == true, then the i-th argument from + // the top of the stack (i.e. the i-th last argument of the function) + // is relevant for dispatch. + // dispatch_arg_indices_reverse_ is allowed to have zero bits set; that just + // means you must do the fallthrough + c10::utils::bitset dispatch_arg_indices_reverse_; + + // Set of functionality keys for which the operator does NOT have fallthrough + // kernel. + DispatchKeySet nonFallthroughKeys_; + // Set of functionality keys for which the operator does NOT have fallthrough + // kernel, defined PER BACKEND. This is only needed if we know that the + // operator has a different set of fallthroughs defined for some backends. + std::array nonFallthroughKeysPerBackend_; + // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast + // path), or if we need to fall back to the slower path and check + // nonFallthroughKeysPerBackend_ + bool requiresBitsetPerBackend_{false}; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h new file mode 100644 index 0000000000000000000000000000000000000000..2dc51027a01bb6fa8e83c3542d06e3c1008a4db5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h @@ -0,0 +1,955 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifndef NDEBUG +#include +#endif + +namespace c10 { + +TORCH_API bool show_dispatch_trace(); +TORCH_API void dispatch_trace_nesting_incr(); +TORCH_API void dispatch_trace_nesting_decr(); +TORCH_API int64_t dispatch_trace_nesting_value(); + +struct DispatchTraceNestingGuard { + DispatchTraceNestingGuard() { + dispatch_trace_nesting_incr(); + } + ~DispatchTraceNestingGuard() { + dispatch_trace_nesting_decr(); + } +}; + +class TORCH_API OperatorHandle; +template +class TypedOperatorHandle; + +/** + * Implement this interface and register your instance with the dispatcher + * to get notified when operators are registered or deregistered with + * the dispatcher. + * + * NB: registration events only occur when a 'def' occurs; we don't trigger + * on 'impl' or 'fallback' calls. + */ +class TORCH_API OpRegistrationListener { + public: + virtual ~OpRegistrationListener(); + + virtual void onOperatorRegistered(const OperatorHandle& op) = 0; + virtual void onOperatorDeregistered(const OperatorHandle& op) = 0; +}; + +namespace detail { +class RegistrationListenerList; +} +class SchemaRegistrationHandleRAII; + +/** + * Top-level dispatch interface for dispatching via the dynamic dispatcher. + * Most end users shouldn't use this directly; if you're trying to register + * ops look in op_registration + */ +class TORCH_API Dispatcher final { + private: + // For direct access to backend fallback information + friend class impl::OperatorEntry; + + struct OperatorDef final { + explicit OperatorDef(OperatorName&& op_name) : op(std::move(op_name)) {} + + impl::OperatorEntry op; + + // These refer to the number of outstanding RegistrationHandleRAII + // for this operator. def_count reflects only def() registrations + // (in the new world, this should only ever be 1, but old style + // registrations may register the schema multiple times, which + // will increase this count). def_and_impl_count reflects the number + // of combined def() and impl() registrations. When the last def() gets + // unregistered, we must immediately call the Deregistered listeners, but we + // must not actually delete the handle as there are other outstanding RAII + // destructors which will try to destruct and they had better still have a + // working operator handle in this case + size_t def_count = 0; + size_t def_and_impl_count = 0; + }; + friend class OperatorHandle; + template + friend class TypedOperatorHandle; + + struct Guard final { + Guard() : alive(true) {} + std::atomic alive; + std::mutex mutex; + }; + + public: + ~Dispatcher(); + + // Implementation note: this class abstracts over the fact that we have + // per-operator dispatch tables. This could be easily adjusted to have a + // single global hash table. + static Dispatcher& realSingleton(); + + C10_ALWAYS_INLINE static Dispatcher& singleton() { +#if !defined C10_MOBILE + // Implemented inline so that steady-state code needn't incur + // function-call overhead. We can't just inline `realSingleton` + // because the function-local static would get duplicated across + // all DSOs that include & use this header, leading to multiple + // singleton instances. + static Dispatcher& s = realSingleton(); + return s; +#else + // For C10_MOBILE, we should never inline a static function that + // has a static member, since the generated code calls + // __cxa_guard_acquire and __cxa_guard_release which help + // implement exactly once semantics for the initialization of the + // static Dispatcher& s above (for the non-mobile case). That + // additional code when duplicated across all operator stubs + // for every backend results in a lot of additional code + // being generated by the compiler. + return realSingleton(); +#endif + } + + // ------------------------------------------------------------------------ + // + // Accessing operators by schema + // + // ------------------------------------------------------------------------ + + /** + * Looks for an operator schema with the given name and overload name + * and returns it if it is registered WITH A SCHEMA. + * Returns nullopt otherwise. + */ + std::optional findSchema(const OperatorName& operator_name); + + /** + * Variant of findSchema that results in less code generated at the call site. + * It (1) takes const char* pointer rather than OperatorName (so we skip + * generating std::string constructor calls at the call site), and (2) + * it raises an exception if the operator is not found (so we skip + * generating exception raising code at the call site) + * + * Irritatingly, we still have to generate the handful of instructions + * for dealing with an exception being thrown during static initialization + * (e.g. __cxa_guard_abort). If we could annotate this method noexcept we + * could avoid this code too, but as the name of the function suggests, + * it does throw exceptions. + */ + OperatorHandle findSchemaOrThrow(const char* name, const char* overload_name); + + // Like findSchema, but also returns OperatorHandle even if there is no schema + std::optional findOp(const OperatorName& operator_name); + + // Returns a list of all operator names present in the operatorLookupTable_ + const std::vector getAllOpNames(); + + // Returns a list of all operator names present in the operatorLookupTable_ + // for a given dispatch key + const std::vector getAllOpNamesForDispatchKey(DispatchKey k); + + // ------------------------------------------------------------------------ + // + // Invoking operators + // + // ------------------------------------------------------------------------ + + template + Return call(const TypedOperatorHandle& op, Args... args) + const; + + template + static Return callWithDispatchKeySlowPath( + const TypedOperatorHandle& op, + at::StepCallbacks& stepCallbacks, + DispatchKeySet dispatchKeySet, + const KernelFunction& kernel, + Args... args); + + // Like call, but intended for use in a redispatch in kernels that have + // explicitly performed the DispatchKey update calculatulation. This will take + // the DispatchKeySet completely as is and dispatch to the kernel of the + // corresponding highest priority key in the set. Note that this version of + // redispatch treats the inputted DispatchKeySet *as is*, and does NOT mask + // out the highest priority key. See Note [Plumbing Keys Through The + // Dispatcher] + template + Return redispatch( + const TypedOperatorHandle& op, + DispatchKeySet currentDispatchKeySet, + Args... args) const; + + // Invoke an operator via the boxed calling convention using an IValue stack + void callBoxed(const OperatorHandle& op, Stack* stack) const; + void callBoxedForDispatchKey( + const OperatorHandle& op, + DispatchKey dk, + Stack* stack) const; + + // TODO: This will only be useful if we write a backend fallback that plumbs + // dispatch keys (currently there are none) See Note [Plumbing Keys Through + // The Dispatcher] + void redispatchBoxed( + const OperatorHandle& op, + DispatchKeySet dispatchKeySet, + Stack* stack) const; + + bool hasBackendFallbackForDispatchKey(DispatchKey dk) { + auto dispatch_ix = getDispatchTableIndexForDispatchKey(dk); + if (dispatch_ix < 0) + return false; + return backendFallbackKernels_[dispatch_ix].kernel.isValid(); + } + + // Used by torchdeploy/multipy for multiple // codespell:ignore: multipy + // interpreters racing. + void waitForDef(const FunctionSchema& schema); + void waitForImpl( + const OperatorName& op_name, + std::optional dispatch_key); + + // ------------------------------------------------------------------------ + // + // Performing registrations (NON user public; use op_registration) + // + // ------------------------------------------------------------------------ + + /** + * Register a new operator schema. + * + * If a schema with the same operator name and overload name already exists, + * this function will check that both schemas are exactly identical. + */ + RegistrationHandleRAII registerDef( + FunctionSchema schema, + std::string debug, + std::vector tags = {}); + + /** + * Register a kernel to the dispatch table for an operator. + * If dispatch_key is nullopt, then this registers a fallback kernel. + * + * @return A RAII object that manages the lifetime of the registration. + * Once that object is destructed, the kernel will be deregistered. + */ + // NB: steals the inferred function schema, as we may need to hold on to + // it for a bit until the real schema turns up + RegistrationHandleRAII registerImpl( + OperatorName op_name, + std::optional dispatch_key, + KernelFunction kernel, + std::optional cpp_signature, + std::unique_ptr inferred_function_schema, + std::string debug); + + /** + * Given an operator, tells the Dispatcher that we have implemented a fake + * impl for this op in the given Python module. Call this a "pystub". + */ + RegistrationHandleRAII registerPythonModule( + const OperatorName& op_name, + const char* pymodule, + const char* context); + + /** + * Given an operator, throws if we have a pystub. + */ + void throwIfHasPythonModule(OperatorName op_name); + + std::optional> getPyStub( + OperatorName op_name); + + /** + * Register a new operator by name. + */ + RegistrationHandleRAII registerName(OperatorName op_name); + + /** + * Register a fallback kernel for a backend. + * If an operator is called but there is no concrete kernel for the dispatch + * key of the given operator arguments, it will check if there is such a + * fallback kernel for the given dispatch key and, if yes, call that one. + */ + RegistrationHandleRAII registerFallback( + DispatchKey dispatch_key, + KernelFunction kernel, + std::string debug); + + /** + * Use to register whenever we had a TORCH_LIBRARY declaration in the frontend + * API. These invocations are only permitted once per program, so we raise + * an error if this is called again for the same namespace. + */ + RegistrationHandleRAII registerLibrary(std::string ns, std::string debug); + + // ------------------------------------------------------------------------ + // + // Listeners on registrations + // + // ------------------------------------------------------------------------ + + /** + * Add a listener that gets called whenever a new op is registered or an + * existing op is deregistered. Immediately after registering, this listener + * gets called for all previously registered ops, so it can be used to keep + * track of ops registered with this dispatcher. + */ + RegistrationHandleRAII addRegistrationListener( + std::unique_ptr listener); + + void checkInvariants() const; + + // + // ------------------------------------------------------------------------ + // + // Assertions + // + // ------------------------------------------------------------------------ + + /** + * For testing purposes. + * Returns a list of all operators that were created through calls to + * registerImpl(), without any corresponding calls to registerDef(). After + * static initialization is done this is almost certainly a bug, as the + * created OperatorHandle won't have any schema associated with it and users + * calling the op through the dispatcher won't be able to access it + * + * Note that we cannot enforce this invariant "as we go" during static + * initialization, due to undefined static initialization order- we have no + * guarantees over the order in which .def() and .impl() calls are registered + * in the dispatcher at static initialization time. So this function should + * only be called after static initialization. + */ + std::vector findDanglingImpls() const; + + /** + * Useful for inspecting global Dispatcher registration state. + * Returns the names of all operators with a kernel registered for the + * specified DispatchKey. If no DispatchKey is specified, it returns all + * registered operators. + */ + std::vector getRegistrationsForDispatchKey( + std::optional k) const; + + private: + Dispatcher(); + + static int64_t sequenceNumberForRunningRecordFunction( + DispatchKey dispatchKey, + DispatchKeySet dispatchKeySet); + static void runRecordFunction( + at::RecordFunction& guard, + at::RecordFunction::schema_ref_t schema_ref, + DispatchKey dispatchKey, + DispatchKeySet dispatchKeySet); + static void runRecordFunction( + at::RecordFunction& guard, + at::RecordFunction::schema_ref_t schema_ref, + DispatchKey dispatchKey, + DispatchKeySet dispatchKeySet, + c10::ArrayRef args); + +#ifdef FBCODE_CAFFE2 + static bool profilingOperatorEvents(); + static void fireOpStartUSDT( + at::RecordFunction::schema_ref_t schema_ref, + std::vector& argsAddresses, + std::vector& argsTypes); + static void fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref); +#endif // FBCODE_CAFFE2 + + OperatorHandle findOrRegisterSchema_(FunctionSchema&& schema); + OperatorHandle findOrRegisterName_(const OperatorName& op_name); + + void deregisterDef_(const OperatorHandle& op, const OperatorName& op_name); + void deregisterImpl_( + const OperatorHandle& op, + const OperatorName& op_name, + std::optional dispatch_key, + impl::OperatorEntry::AnnotatedKernelContainerIterator kernel_handle); + void deregisterName_(const OperatorHandle& op, const OperatorName& op_name); + void deregisterFallback_(DispatchKey dispatchKey); + void deregisterLibrary_(const std::string& ns); + void cleanup(const OperatorHandle& op, const OperatorName& op_name); + void checkSchemaCompatibility( + const OperatorHandle& op, + const FunctionSchema& schema, + const std::string& debug); + + std::list operators_; +#if !defined(C10_MOBILE) + LeftRight> + operatorLookupTable_; +#else + RWSafeLeftRightWrapper> + operatorLookupTable_; +#endif + // Map from namespace to debug string (saying, e.g., where the library was + // defined) + ska::flat_hash_map libraries_; + + std::array + backendFallbackKernels_; + + std::unique_ptr listeners_; + + // This condition variable gets notified whenever we add a new def/impl to the + // dispatch table. This is primarily used by multiply/torchdeploy, when + // we have multiple interpreters trying to register to the dispatch table. + // In this situation, whenever the non-primary interpreter would have tried + // to register to the dispatch table, instead it will check to see if the + // expected registration has already been made, and if it hasn't, wait on + // this condition variable to see if it was just racing with the primary + // interpreter. + // + // We expect it to be rare for there to be any waiters on this condition + // variable. This is mostly just to help give better diagnostics if + // something goes horribly wrong + std::condition_variable cond_var_; + + // Protect concurrent access to the dispatcher. We store this in a + // `shared_ptr` as we return callbacks that call back into dispatcher methods, + // and we need to be able to handle and guard against the event when the + // `Dispatcher` has been destroyed before the callbacks fire. + std::shared_ptr guard_; +}; + +/** + * This is a handle to an operator schema registered with the dispatcher. + * This handle can be used to register kernels with the dispatcher or + * to lookup a kernel for a certain set of arguments. + */ +class TORCH_API OperatorHandle { + template + friend struct std::hash; + + public: + OperatorHandle(OperatorHandle&&) noexcept = default; + OperatorHandle& operator=(OperatorHandle&&) noexcept = default; + OperatorHandle(const OperatorHandle&) = default; + OperatorHandle& operator=(const OperatorHandle&) = default; + // NOLINTNEXTLINE(performance-trivially-destructible) + ~OperatorHandle(); + + const OperatorName& operator_name() const { + return operatorDef_->op.operator_name(); + } + + bool hasSchema() const { + return operatorDef_->op.hasSchema(); + } + + const FunctionSchema& schema() const { + return operatorDef_->op.schema(); + } + + const std::string& debug() const { + return operatorDef_->op.debug(); + } + + std::string dumpState() const { + return operatorDef_->op.dumpState(); + } + + bool hasKernelForDispatchKey(DispatchKey k) const { + return operatorDef_->op.hasKernelForDispatchKey(k); + } + + bool isKernelFallthroughKernel(DispatchKey k) const { + return operatorDef_->op.kernelForDispatchKey(k).isFallthrough(); + } + + bool hasKernelForAnyDispatchKey(DispatchKeySet k) const { + return operatorDef_->op.hasKernelForAnyDispatchKey(k); + } + + bool hasComputedKernelForDispatchKey(DispatchKey k) const { + return operatorDef_->op.hasComputedKernelForDispatchKey(k); + } + + SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const { + return operatorDef_->op.getComputedKernelForDispatchKey(k); + } + + std::string dumpComputedTable() const { + return operatorDef_->op.dumpComputedTable(); + } + + void checkInvariants() const { + operatorDef_->op.checkInvariants(); + } + + c10::ArrayRef getTags() const { + return operatorDef_->op.getTags(); + } + + void setReportErrorCallback_(std::unique_ptr callback) { + operatorDef_->op.setReportErrorCallback_(std::move(callback)); + } + + bool hasTag(const at::Tag& tag) const { + for (const auto& tag_ : getTags()) { + if (tag == tag_) { + return true; + } + } + return false; + } + + template + TypedOperatorHandle typed() const { + // NB: This assert is not 100% sound: you can retrieve a typed() operator + // handle prior to ANY C++ signature being registered on the operator + // and the check will say everything is OK (at which point you can then + // smuggle in a kernel that is typed incorrectly). For everything + // in core library this won't happen, because all the static registrations + // will be done by the time a typed() handle is acquired. +#if !defined C10_MOBILE + operatorDef_->op.assertSignatureIsCorrect(); + if (fn_has_symint::value) { + operatorDef_->op.assertSignatureIsCorrect< + typename fn_remove_symint::type>(); + } +#endif + return TypedOperatorHandle(operatorIterator_); + } + + void callBoxed(Stack* stack) const { + c10::Dispatcher::singleton().callBoxed(*this, stack); + } + + void callBoxed(Stack& stack) const { + callBoxed(&stack); + } + + void callBoxedForDispatchKey(DispatchKey dk, Stack& stack) const { + c10::Dispatcher::singleton().callBoxedForDispatchKey(*this, dk, &stack); + } + + void redispatchBoxed(DispatchKeySet ks, Stack* stack) const { + c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack); + } + + template + PyObject* getPythonOp( + c10::impl::PyInterpreter* self_interpreter, + F slow_accessor) const { + return operatorDef_->op.getPythonOp(self_interpreter, slow_accessor); + } + + bool operator==(const OperatorHandle& other) const { + return operatorDef_ == other.operatorDef_; + } + + bool operator!=(const OperatorHandle& other) const { + return operatorDef_ != other.operatorDef_; + } + + private: + explicit OperatorHandle( + std::list::iterator operatorIterator) + : operatorDef_(&*operatorIterator), operatorIterator_(operatorIterator) {} + friend class Dispatcher; + template + friend class TypedOperatorHandle; + + // Storing a direct pointer to the OperatorDef even though we + // already have the iterator saves an instruction in the critical + // dispatch path. The iterator is effectively a + // pointer-to-std::list-node, and (at least in libstdc++'s + // implementation) the element is at an offset 16 bytes from that, + // because the prev/next pointers come first in the list node + // struct. So, an add instruction would be necessary to convert from the + // iterator to an OperatorDef*. + Dispatcher::OperatorDef* operatorDef_; + + // We need to store this iterator in order to make + // Dispatcher::cleanup() fast -- it runs a lot on program + // termination (and presumably library unloading). + std::list::iterator operatorIterator_; +}; + +/** + * This is a handle to an operator schema registered with the dispatcher. + * It holds the same information as an OperatorHandle, but it is templated + * on the operator arguments and allows calling the operator in an + * unboxed way. + */ +template +class TypedOperatorHandle final { + static_assert( + guts::false_t(), + "FuncType in OperatorHandle::typed was not a valid function type"); +}; +template +class TypedOperatorHandle final : public OperatorHandle { + public: + TypedOperatorHandle(TypedOperatorHandle&&) noexcept = default; + TypedOperatorHandle& operator=(TypedOperatorHandle&&) noexcept = default; + TypedOperatorHandle(const TypedOperatorHandle&) = default; + TypedOperatorHandle& operator=(const TypedOperatorHandle&) = default; + + // See [Note: Argument forwarding in the dispatcher] for why Args doesn't use + // && + C10_ALWAYS_INLINE Return call(Args... args) const { + return c10::Dispatcher::singleton().call( + *this, std::forward(args)...); + } + + // See [Note: Argument forwarding in the dispatcher] for why Args doesn't use + // && + C10_ALWAYS_INLINE Return + redispatch(DispatchKeySet currentDispatchKeySet, Args... args) const { + return c10::Dispatcher::singleton().redispatch( + *this, currentDispatchKeySet, std::forward(args)...); + } + + private: + explicit TypedOperatorHandle( + std::list::iterator operatorIterator) + : OperatorHandle(operatorIterator) {} + friend class OperatorHandle; +}; + +namespace detail { +template +inline void unused_arg_(const Args&... /*unused*/) {} + +// CaptureKernelCall is intended to capture return values from Dispatcher +// unboxed kernel calls. A record function may request to get outputs from the +// kernel calls. For boxed kernels, it's straightforward, the returned values +// are in the stack object. The stack can be passed to record functions. For +// unboxed kernels, we need to handle different kinds of return values, cache +// them temporarily, then release the values for the actual function call +// return. +template +struct CaptureKernelCall { + template + CaptureKernelCall( + const F& kernel, + const TypedOperatorHandle& op, + const DispatchKeySet& dispatchKeySet, + Args&&... args) + // Calls the kernel and capture the result in output_. + : output_{kernel.template call( + op, + dispatchKeySet, + std::forward(args)...)} {} + // Wraps the return values in a Stack. + Stack getOutputs() { + Stack stack; + impl::push_outputs::copy(output_, &stack); + return stack; + } + // Since we are returning the output_, we don't expect the output_ to be used + // afterward. Copy elision and RVO do not apply to class data members. Using + // move semantic to avoid copies when possible. + ReturnType release() && { + return std::move(output_); + } + + private: + ReturnType output_; +}; + +// Handle the lvalue reference differently since it should not be moved. +template <> +inline at::Tensor& CaptureKernelCall::release() && { + return output_; +} + +// Handle case where the kernel returns void. +template <> +struct CaptureKernelCall { + template + CaptureKernelCall( + const F& kernel, + const TypedOperatorHandle& op, + const DispatchKeySet& dispatchKeySet, + Args&&... args) { + // Calling the kernel and no need to capture void. + kernel.template call( + op, dispatchKeySet, std::forward(args)...); + } + Stack getOutputs() { + return Stack(); + } + void release() && {} +}; + +TORCH_API void _print_dispatch_trace( + const std::string& label, + const std::string& op_name, + const DispatchKeySet& dispatchKeySet); + +} // namespace detail + +// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use && +template +inline Return Dispatcher::callWithDispatchKeySlowPath( + const TypedOperatorHandle& op, + at::StepCallbacks& stepCallbacks, + DispatchKeySet dispatchKeySet, + const KernelFunction& kernel, + Args... args) { + // If callbacks need inputs, we box the arguments and pass them to the guard. + // Note: For perf reasons we wouldn't want to prematurely box the arguments. + at::RecordFunction guard(std::move(stepCallbacks)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(op.operatorDef_->op.isObserved()); + auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); + auto& schema = op.schema(); + auto schema_ref = std::reference_wrapper(schema); + constexpr auto num_boxed_args = impl::boxed_size(); + if constexpr (num_boxed_args != 0) { + if (guard.needsInputs()) { + // If we used std::array here, we would + // have to spend time default constructing the IValues in + // boxedArgs. aligned_storage has no such requirement. + // NOLINTNEXTLINE(*array*) + alignas(IValue) std::byte boxedArgs[num_boxed_args * sizeof(IValue)]; + // For debugging only; could be removed (but the compiler will do + // that for us and it's nice to have the extra assurance of + // correctness from our debug builds). + IValue* boxedArgsPtr = reinterpret_cast(boxedArgs); + impl::boxArgsToStack(boxedArgsPtr, args...); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + reinterpret_cast(boxedArgsPtr) == + boxedArgs + num_boxed_args * sizeof(IValue)); + // I don't *think* we need std::launder here, because IValue has + // no subclasses and no const or reference fields. + runRecordFunction( + guard, + schema_ref, + dispatchKey, + dispatchKeySet, + c10::ArrayRef( + reinterpret_cast(boxedArgs), num_boxed_args)); + boxedArgsPtr = reinterpret_cast(boxedArgs); + for (size_t ii = 0; ii < num_boxed_args; ++ii) { + (boxedArgsPtr + ii)->~IValue(); + } + } else { + runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); + } + } else { + runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); + } + + if (C10_UNLIKELY(guard.needsOutputs())) { + // Calls the kernel and capture the output temporarily to pass to + // RecordFunction. + detail::CaptureKernelCall captureKernelCall( + kernel, op, dispatchKeySet, std::forward(args)...); + guard.setOutputs(captureKernelCall.getOutputs()); + // Releases the captured output to return to caller. + return std::move(captureKernelCall).release(); + } + + // keeping the guard alive while executing the kernel + return kernel.template call( + op, dispatchKeySet, std::forward(args)...); +} + +// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use && +template +C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call( + const TypedOperatorHandle& op, + Args... args) const { + auto dispatchKeySet = + op.operatorDef_->op.dispatchKeyExtractor() + .template getDispatchKeySetUnboxed(args...); +#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG) + DispatchTraceNestingGuard debug_guard; + if (show_dispatch_trace()) { + detail::_print_dispatch_trace( + "[call]", toString(op.operator_name()), dispatchKeySet); + } +#endif + const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet); +#ifndef PYTORCH_DISABLE_PER_OP_PROFILING + auto step_callbacks = + at::getStepCallbacksUnlessEmpty(at::RecordScope::FUNCTION); + if (C10_UNLIKELY( + step_callbacks.has_value() && op.operatorDef_->op.isObserved())) { + return callWithDispatchKeySlowPath( + op, + *step_callbacks, + dispatchKeySet, + kernel, + std::forward(args)...); + } +#endif // PYTORCH_DISABLE_PER_OP_PROFILING + +#ifdef FBCODE_CAFFE2 + if (profilingOperatorEvents()) { + std::vector argsAddresses = {(void*)(&args)...}; + std::vector argsTypes = {(typeid(args).name())...}; + struct FireOpRAII { + FireOpRAII( + at::RecordFunction::schema_ref_t schema_ref, + std::vector& argsAddresses, + std::vector& argsTypes) + : schema_ref_(schema_ref) { + fireOpStartUSDT(schema_ref, argsAddresses, argsTypes); + } + ~FireOpRAII() { + fireOpEndUSDT(schema_ref_); + } + at::RecordFunction::schema_ref_t schema_ref_; + } event(op.schema(), argsAddresses, argsTypes); + return kernel.template call( + op, dispatchKeySet, std::forward(args)...); + } else { + return kernel.template call( + op, dispatchKeySet, std::forward(args)...); + } +#else + return kernel.template call( + op, dispatchKeySet, std::forward(args)...); +#endif // FBCODE_CAFFE2 +} + +// See [Note: Argument forwarding in the dispatcher] for why Args doesn't use && +template +inline Return Dispatcher::redispatch( + const TypedOperatorHandle& op, + DispatchKeySet currentDispatchKeySet, + Args... args) const { + // do not use RecordFunction on redispatch +#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG) + DispatchTraceNestingGuard debug_guard; + if (show_dispatch_trace()) { + detail::_print_dispatch_trace( + "[redispatch]", toString(op.operator_name()), currentDispatchKeySet); + } +#endif + const KernelFunction& kernel = + op.operatorDef_->op.lookup(currentDispatchKeySet); + return kernel.template call( + op, currentDispatchKeySet, std::forward(args)...); +} + +inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) + const { + // note: this doesn't need the mutex because write operations on the list keep + // iterators intact. + const auto& entry = op.operatorDef_->op; + auto dispatchKeySet = + entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack); +#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG) + DispatchTraceNestingGuard debug_guard; + if (show_dispatch_trace()) { + detail::_print_dispatch_trace( + "[callBoxed]", toString(op.operator_name()), dispatchKeySet); + } +#endif + const auto& kernel = entry.lookup(dispatchKeySet); +#ifndef PYTORCH_DISABLE_PER_OP_PROFILING + auto step_callbacks = + at::getStepCallbacksUnlessEmpty(at::RecordScope::FUNCTION); + if (C10_UNLIKELY(step_callbacks.has_value() && entry.isObserved())) { + at::RecordFunction guard(std::move(*step_callbacks)); + auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); + auto& schema = op.schema(); + auto schema_ref = std::reference_wrapper(schema); + guard.needsInputs() + ? runRecordFunction( + guard, + schema_ref, + dispatchKey, + dispatchKeySet, + c10::ArrayRef(stack->data(), stack->size())) + : runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); + + // keeping the guard alive while executing the kernel + kernel.callBoxed(op, dispatchKeySet, stack); + + if (C10_UNLIKELY(guard.needsOutputs())) { + guard.setOutputs(*stack); + } + return; + } +#endif // PYTORCH_DISABLE_PER_OP_PROFILING + kernel.callBoxed(op, dispatchKeySet, stack); +} + +// NB: this doesn't count as a "true" dispatcher jump, so no instrumentation +inline void Dispatcher::callBoxedForDispatchKey( + const OperatorHandle& op, + DispatchKey dk, + Stack* stack) const { + // note: this doesn't need the mutex because write operations on the list keep + // iterators intact. + const auto& entry = op.operatorDef_->op; + // We still compute this as we're obligated to pass it on to the internal + // kernel, if it is a boxed fallback + auto dispatchKeySet = + entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack); + const auto& kernel = ([&]() { + if (op.hasKernelForDispatchKey(dk)) { + return entry.kernelForDispatchKey(dk); + } else { + auto idx = getDispatchTableIndexForDispatchKey(dk); + TORCH_INTERNAL_ASSERT(idx >= 0); + return backendFallbackKernels_[idx].kernel; + } + })(); + kernel.callBoxed(op, dispatchKeySet, stack); +} + +inline void Dispatcher::redispatchBoxed( + const OperatorHandle& op, + DispatchKeySet dispatchKeySet, + Stack* stack) const { + // note: this doesn't need the mutex because write operations on the list keep + // iterators intact. + const auto& entry = op.operatorDef_->op; +#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG) + DispatchTraceNestingGuard debug_guard; + if (show_dispatch_trace()) { + detail::_print_dispatch_trace( + "[redispatchBoxed]", toString(op.operator_name()), dispatchKeySet); + } +#endif + const auto& kernel = entry.lookup(dispatchKeySet); + kernel.callBoxed(op, dispatchKeySet, stack); +} + +} // namespace c10 + +namespace std { + +template <> +struct hash { + size_t operator()(const c10::OperatorHandle& op) const noexcept { + return std::hash{}(static_cast(op.operatorDef_)); + } +}; + +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h new file mode 100644 index 0000000000000000000000000000000000000000..ddd4e653c3f67786dd37e93e3ca1ab1e75acf697 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { + +struct TORCH_API ObservedOperators { + ObservedOperators() = delete; + + static bool isObserved(const OperatorName& name); + + static std::unordered_set& getUnobservedOperatorList(); +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h new file mode 100644 index 0000000000000000000000000000000000000000..fb78faeedd41167e446c29542349acfcb2f2cce5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h @@ -0,0 +1,342 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#ifdef C10_MOBILE +#define C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY +#endif + +namespace c10 { + +class Dispatcher; + +namespace impl { + +// This data structure represents a kernel that was registered to us from a +// user. Unlike KernelFunction, AnnotatedKernel contains some extra metadata +// about the kernel that isn't necessary for actual dispatching (this is why +// we don't put AnnotatedKernel in the actual DispatchTable), but is useful for +// giving good error messages. +struct AnnotatedKernel final { + AnnotatedKernel( + KernelFunction k, + std::unique_ptr s, + std::string d) + : kernel(std::move(k)), + inferred_function_schema(std::move(s)), + debug(std::move(d)) {} + AnnotatedKernel() = default; + KernelFunction kernel; + std::unique_ptr inferred_function_schema; + // A little debug string to help us identify the kernel in question. + // Most importantly it records the TORCH_LIBRARY block that did the + // registration. + std::string debug; +}; + +// This data structure represents operator schema, with metadata specifying +// where the registration of this schema occurred +struct AnnotatedSchema final { + AnnotatedSchema(FunctionSchema s, std::string d) + : schema(std::move(s)), debug(std::move(d)) {} + FunctionSchema schema; + std::string debug; +}; + +// Internal data structure that records information about a specific operator. +// It's not part of the public API; typically, users will interact with +// OperatorHandle instead. +// +// Concurrent writes to OperatorEntry are protected by the GLOBAL Dispatcher +// lock (this is important because some methods in OperatorEntry access +// dispatcher state) +class TORCH_API OperatorEntry final { + public: + explicit OperatorEntry(OperatorName&& operator_name); + + OperatorEntry(const OperatorEntry&) = delete; + OperatorEntry(OperatorEntry&&) noexcept = delete; + OperatorEntry& operator=(const OperatorEntry&) = delete; + OperatorEntry& operator=(OperatorEntry&&) noexcept = delete; + + const FunctionSchema& schema() const { + TORCH_INTERNAL_ASSERT( + schema_.has_value(), + "Tried to access the schema for ", + name_, + " which doesn't have a schema registered yet"); + return schema_->schema; + } + const std::string& debug() const { + TORCH_INTERNAL_ASSERT(schema_.has_value()); + return schema_->debug; + } + bool hasSchema() const { + return schema_.has_value(); + } + + bool isObserved() const { + return is_observed_; + } + + // We may allocate an OperatorEntry for an operator even when we don't + // have a schema. When we receive the schema registration, we post + // facto register a schema. + // + // NB: registerSchema/deregisterSchema are not idempotent; if you + // attempt to register a schema when one is already present or vice + // versa that is an error. (Refcounting for the registrations is + // handled in the OperatorHandle in Dispatcher) + void registerSchema( + FunctionSchema&& /*schema*/, + std::string&& debug, + std::vector tags = {}); + void deregisterSchema(); + + const OperatorName& operator_name() const { + return name_; + } + +#ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY + using AnnotatedKernelContainer = std::array; +#else + using AnnotatedKernelContainer = std::list; +#endif + using AnnotatedKernelContainerIterator = AnnotatedKernelContainer::iterator; + + // Why are kernels and fallback asymmetric? It has to do with ownership. + // Kernels and the computed dispatch tables for them are canonically + // owned by OperatorEntry, but backend fallbacks are specified once + // and apply for all operators, so they should be owned by Dispatcher. + // However, the registration of a backend fallback affects the + // state of the computed dispatch table, so when a backend fallback + // is updated, we need to update the operator tables too. Thus, + // registerKernel is the mechanism by which we give kernels to + // operator entry to own (and update dispatch table), but we only + // need a non-owning mechanism to update fallback. + + // Precondition: Dispatcher::mutex_ is held + // Postcondition: caller is responsible for disposing of the kernel + AnnotatedKernelContainerIterator registerKernel( + const Dispatcher& dispatcher, + std::optional dispatch_key, + KernelFunction kernel, + std::optional cpp_signature, + std::unique_ptr inferred_function_schema, + std::string debug); + + // Precondition: Dispatcher::mutex_ is held + void deregisterKernel_( + const Dispatcher& dispatcher, + std::optional dispatch_key, + AnnotatedKernelContainerIterator kernel); + + // Precondition: Dispatcher::mutex_ is held + void updateFallback(const Dispatcher& dispatcher, DispatchKey dispatch_key); + + // Precondition: Dispatcher::mutex_ is held + void updateSchemaAliasAnalysis(AliasAnalysisKind a) { + TORCH_INTERNAL_ASSERT(schema_.has_value()); + schema_->schema.setAliasAnalysis(a); + } + + std::string dumpComputedTable() const; + std::string dumpState() const; + void checkInvariants() const; + + const DispatchKeyExtractor& dispatchKeyExtractor() const { + return dispatchKeyExtractor_; + } + + // Asserts that the given FuncType is correct for calling this operator in an + // unboxed way. + template + inline void assertSignatureIsCorrect() { + assertSignatureIsCorrect( + CppSignature::make(), fn_has_symint::value); + } + + void assertSignatureIsCorrect( + const CppSignature& call_signature, + bool has_symint) const; + + [[noreturn]] void reportError(DispatchKey dispatchKey) const; + + const KernelFunction& lookup(DispatchKeySet ks) const { + const auto idx = ks.getDispatchTableIndexForDispatchKeySet(); + if (C10_UNLIKELY(idx == -1)) { + reportError(ks.highestPriorityTypeId()); + } + const auto& kernel = dispatchTable_[idx]; + // A valid kernel *always* has a boxed kernel and *may* have an + // unboxed kernel. However, we typically do unboxed calls in at:: + // APIs, where the kernel 1) will very likely be valid and 2) + // should have an unboxed kernel. Checking the unboxed kernel + // first will allow us to avoid touching the boxed kernel at all + // in the common case. + if (C10_UNLIKELY(!kernel.isValidUnboxed())) { + if (!kernel.isValid()) { + reportError(ks.highestPriorityTypeId()); + } + } + return kernel; + } + + std::string listAllDispatchKeys() const; + + // Returns true if kernel_ has entry for any key in ks. + // + // Invariant: There are no alias keys in the passed-in dispatch key set. + // Note [No Alias Keys in DispatchKeySet] + // Alias keys should be checked using `hasKernelForDispatchKey` + // Alias keys shouldn't go inside of a DispatchKeySet, since they can + // technically have a value > 63 (causing overflow). + bool hasKernelForAnyDispatchKey(DispatchKeySet ks) const; + // Returns true if kernel_ has entry for a particular key. + bool hasKernelForDispatchKey(DispatchKey k) const; + // Retrieves the kernel entry at a particular key. Symmetric with + // hasKernelForDispatchKey. To get the AnnotatedKernel, see + // getKernelForDispatchKey (private) + const KernelFunction& kernelForDispatchKey(DispatchKey k) const; + // Returns true if the "computed table" has an entry for a particular key. + bool hasComputedKernelForDispatchKey(DispatchKey k) const; + // Returns a KernelFunction corresponding to the kernel in dispatchTable + SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const; + // Returns all the operator tags added at the time of registration + const std::vector& getTags() const; + void setReportErrorCallback_(std::unique_ptr callback); + + template + PyObject* getPythonOp(PyInterpreter* self_interpreter, F slow_accessor) + const { + return py_cache_.ptr_or(self_interpreter, slow_accessor); + } + + private: + OperatorName name_; + std::optional schema_; +#ifndef C10_MOBILE + std::vector tags_; +#endif + std::array dispatchTable_; + DispatchKeyExtractor dispatchKeyExtractor_; + // Pointer to the torch.ops.ns.op.overload object for speed + c10::PyHandleCache py_cache_; + + // kernels_ stores all registered kernels for the corresponding dispatch key + // and catchAllKernels_ stores the catch-all kernels. + // If an operator library gets loaded that overwrites an already existing + // kernel, both kernels will be in that list but only the newer one will be in + // dispatchTable. If any of the kernels go away (say the library gets + // unloaded), we remove the kernel from this list and update the + // dispatchTable if necessary. + // Kernels in the list are ordered by registration time descendingly, + // newer registrations are before older registrations. + // We do not combine dispatchTable and kernels into one hash map because + // kernels is a larger data structure and accessed quite infrequently + // while dispatchTable is accessed often and should be kept small to fit + // into CPU caches. + // Invariants: + // - dispatchTable[dispatch_key] == kernels_[dispatch_key].front() + // - dispatchTable[dispatch_key] does not exist if and only if + // kernels_[dispatch_key] does not exist + // - If kernels_[dispatch_key] exists, then it has elements. + // It is never an empty list. + // + // Why do we do that? + // ----- + // We mostly do this to enable Jupyter notebooks where a cell registering + // a kernel could be executed multiple times and the later execution + // should overwrite the earlier one. Note that this still fails when the + // function schema changed between the executions, but it works as long + // as the function schema didn't change. A better solution would be to + // unload the old extension library from the Jupyter cell when the cell is + // re-executed and then only allow one kernel here, i.e. error if a kernel + // is already registered, but that's a lot of effort to implement and + // currently not high-pri. + ska::flat_hash_map< + DispatchKey, +#ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY + // On mobile, we needn't worry about Jupyter notebooks. + std::array +#else + std::list +#endif + > + kernels_; + + const AnnotatedKernel& missingKernel() const; + const AnnotatedKernel& ambiguousAutogradOtherKernel() const; + + // cpp_signature_ stores function signature if any of + // the kernels was created in a way that allowed us to know the function + // signature (i.e. by supplying an unboxed C++ kernel function). + // If this is set, it will be used to check that future kernel + // registrations match and it will be used in unboxed function calls + // to verify their arguments against the known function signature. + struct CppSignatureWithDebug { + CppSignature signature; + std::string debug; + std::optional dispatch_key; + }; + std::optional cpp_signature_; + std::optional sym_cpp_signature_; + + // A Python custom error handler for OperatorEntry::reportError + std::unique_ptr report_error_callback_; + + // Whether this operator needs to be observed with RecordFunction + const bool is_observed_; + + [[noreturn]] void reportSignatureError( + const CppSignature& call_signature, + const CppSignatureWithDebug& saved_signature) const; + const KernelFunction& computeDispatchTableEntry( + const c10::Dispatcher& dispatcher, + DispatchKey dispatch_key) const; + std::pair + computeDispatchTableEntryWithDebug( + const c10::Dispatcher& dispatcher, + DispatchKey dispatch_key) const; + // This function re-establishes the invariant that dispatchTable + // contains the front element from the kernels list for a given runtime + // dispatch key. + void updateDispatchTableEntry_( + const c10::Dispatcher& dispatcher, + DispatchKey dispatch_key); + // Like above, but also handles alias dispatch keys. + void updateDispatchTable_( + const c10::Dispatcher& dispatcher, + DispatchKey dispatch_key); + // Like above, but for ALL entries in the dispatch table. + void updateDispatchTableFull_(const c10::Dispatcher& dispatcher); + // Retrieves a pointer to AnnotatedKernel at + // kernels_.at(dispatch_key).front(). + const AnnotatedKernel* getKernelForDispatchKey( + DispatchKey dispatch_key) const; +}; + +} // namespace impl +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h new file mode 100644 index 0000000000000000000000000000000000000000..7d506e7a43784a38e1646e6ced419bdf8f080aac --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10 { + +enum class AliasAnalysisKind : uint8_t { + INTERNAL_SPECIAL_CASE, + CONSERVATIVE, // The most conservative alias analysis type, assumes + // side-effects. This is the default analysis. + FROM_SCHEMA, + PURE_FUNCTION +}; + +#if !defined(_MSC_VER) +constexpr // Our current MSVC version has a bug that doesn't allow this to be + // constexpr. +#endif + inline const char* + toString(AliasAnalysisKind aliasAnalysisKind) { + return (aliasAnalysisKind == AliasAnalysisKind::CONSERVATIVE) ? "CONSERVATIVE" + : (aliasAnalysisKind == AliasAnalysisKind::FROM_SCHEMA) ? "FROM_SCHEMA" + : (aliasAnalysisKind == AliasAnalysisKind::PURE_FUNCTION) + ? "PURE_FUNCTION" + : (aliasAnalysisKind == AliasAnalysisKind::INTERNAL_SPECIAL_CASE) + ? "INTERNAL_SPECIAL_CASE" + : "UNKNOWN"; +} + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h new file mode 100644 index 0000000000000000000000000000000000000000..c66b08e8350e355de45777c2e15f71dcdbb8a2f2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10 { + +class RegistrationHandleRAII final { + public: + explicit RegistrationHandleRAII(std::function onDestruction) + : onDestruction_(std::move(onDestruction)) {} + + ~RegistrationHandleRAII() { + if (onDestruction_) { + onDestruction_(); + } + } + + RegistrationHandleRAII(const RegistrationHandleRAII&) = delete; + RegistrationHandleRAII& operator=(const RegistrationHandleRAII&) = delete; + + RegistrationHandleRAII(RegistrationHandleRAII&& rhs) noexcept + : onDestruction_(std::move(rhs.onDestruction_)) { + rhs.onDestruction_ = nullptr; + } + + RegistrationHandleRAII& operator=(RegistrationHandleRAII&& rhs) noexcept { + onDestruction_ = std::move(rhs.onDestruction_); + rhs.onDestruction_ = nullptr; + return *this; + } + + private: + std::function onDestruction_; +}; + +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h new file mode 100644 index 0000000000000000000000000000000000000000..41936f74d3f79450df15220020866d9ca2de492a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h @@ -0,0 +1,86 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +/* + * [Note: hacky wrapper removal for optional tensor] + * + * The kernel implementation takes an optional tensor marked in the schema as + * Tensor? but the C++ function takes Tensor instead of the std::optional + * expected by the dispatcher. + * + * To remove the hacky wrapper, the C++ function is changed to take + * std::optional and unwrap the Tensor value at the beginning of + * the function, e.g.: + * > c10::MaybeOwned weight_maybe_owned = + * > at::borrow_from_optional_tensor(weight_opt); + * > const Tensor& weight = *weight_maybe_owned; + * + * We may want to make the kernel handle optional directly without + * going through the creation of a default-constructed Tensor in + * at::borrow_from_optional_tensor. + */ + +/* + * [Note: hacky wrapper removal for TensorOptions] + * + * The kernel implementation takes a TensorOptions argument but the dispatcher + * expects separate arguments for dtype, layout, device, pin_memory. + * + * To remove the hacky wrapper, the kernel implementation is changed to take + * the 4 arguments (dtype, layout, device, pin_memory), and assemble the + * TensorOptions value at the beginning of the function, e.g.: + * > TensorOptions options = TensorOptions().dtype(dtype).layout(layout) + * > .device(device).pinned_memory(pin_memory); + * + * We may want make the kernel handle these parameters directly without going + * through the creation of a TensorOptions value. + */ + +namespace c10::impl { + +TORCH_API void common_device_check_failure(Device common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName); + +inline void check_and_update_common_device(std::optional& common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) { + // TODO: Remove this once the following issue is addressed: + // https://github.com/pytorch/pytorch/issues/57380 + if (!tensor.defined()) { + return; + } + + if (!common_device.has_value()) { + common_device = tensor.device(); + return; + } + + if (C10_UNLIKELY(common_device != tensor.device())) { + common_device_check_failure(*common_device, tensor, methodName, argName); + } +} + +inline void check_and_update_common_device(std::optional& common_device, const std::optional& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) { + if (tensor.has_value()) { + check_and_update_common_device(common_device, tensor.value(), methodName, argName); + } +} + +inline void check_and_update_common_device(std::optional& common_device, at::ITensorListRef tensors, at::CheckedFrom methodName, at::CheckedFrom argName) { + for (const auto& tensor : tensors) { + check_and_update_common_device(common_device, tensor, methodName, argName); + } +} + +inline void check_and_update_common_device(std::optional& common_device, const List>& tensors, at::CheckedFrom methodName, at::CheckedFrom argName) { + for (const auto& tensor : tensors) { + check_and_update_common_device(common_device, tensor, methodName, argName); + } +} +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h new file mode 100644 index 0000000000000000000000000000000000000000..bb01fcab0b4d7314188acbd761f61a12de6d14d8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h @@ -0,0 +1,162 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +/** + * This file contains functionality to take a C++ function and infer its + * c10::FunctionSchema. + */ + +#include +#include + +namespace c10 { +namespace detail::infer_schema { + +/// The templated inference code creates `ArgumentDef` instead of `Argument`, +/// because that can be constructed at compile time and has a much smaller +/// binary size than having calls to `Argument` constructors in the template. +/// Creating `Argument` objects from `ArgumentDef` can then be done at +/// runtime in a non-templated way. +struct ArgumentDef final { + using GetTypeFn = TypePtr(); + GetTypeFn* getTypeFn; + GetTypeFn* getFakeTypeFn; + constexpr ArgumentDef(): getTypeFn(nullptr), getFakeTypeFn(nullptr) {} + explicit constexpr ArgumentDef(GetTypeFn *getTypeFn, GetTypeFn *getFakeTypeFn): getTypeFn(getTypeFn), getFakeTypeFn(getFakeTypeFn) {} +}; + +template +struct bool_t {}; +template<> struct bool_t : std::true_type {}; +template<> struct bool_t : std::false_type {}; + +/// Checks the static C++ types `Types` for correctness to catch common error cases. +template +constexpr int checkStaticTypes() { + // Give nice error messages for some of the common error cases. + // Use a LOUD ERROR MESSAGE SO USERS SEE THE STATIC_ASSERT + static_assert(std::conjunction_v< + bool_t || std::is_same_v || std::is_same_v || std::is_same_v>... + >, "INVALID TYPE: Only int8_t, int64_t and bool are supported as an integral argument type"); + static_assert(std::conjunction_v< + bool_t>... + >, "INVALID TYPE: float is not supported as an argument type, use double instead"); + return 0; +} + +template +constexpr std::array createArgumentVectorFromTypes(std::index_sequence /*unused*/) { + return ( + // Check types for common errors + checkStaticTypes(), + + // Create the return value + std::array{ + ArgumentDef(&getTypePtrCopy>, &getFakeTypePtrCopy>)...} + ); +} + +/// Creates a vector of `ArgumentDef` from a list of C++ types that are specified +/// as template arguments. +template struct createArguments final {}; +template +struct createArguments> final { + static constexpr std::array call() { + return createArgumentVectorFromTypes( + std::make_index_sequence() + ); + } +}; + +/// Creates a vector of `ArgumentDef` from a list of C++ types that are specified +/// as a tuple (i.e. in the way c10 kernels return values). +/// It can be a tuple if there's three output arguments with types A, B, C. +/// It can be an empty tuple<>, or void for kernels that don't return anything. +/// It can be a single type A (i.e. no tuple) for the case where a kernel just +/// returns one value. +template struct createReturns final {}; + +template +struct createReturns, void> final { + static constexpr std::array call() { + return createArgumentVectorFromTypes( + std::make_index_sequence() + ); + } +}; + +template +struct createReturns && !guts::is_instantiation_of::value>> final { + static constexpr std::array call() { + return createReturns>::call(); + } +}; + +template<> +struct createReturns final { + static constexpr std::array call() { + return createReturns>::call(); + } +}; + +template +struct createSingleReturn { + static constexpr std::array call() { + return createArgumentVectorFromTypes(std::make_index_sequence<1>()); + } +}; + +TORCH_API FunctionSchema make_function_schema(std::string&& name, std::string&& overload_name, c10::ArrayRef arguments, c10::ArrayRef returns); +TORCH_API FunctionSchema make_function_schema(c10::ArrayRef arguments, c10::ArrayRef returns); + +/// Creates a `FunctionSchema` object from a `FunctionTraits` type for a +/// function. Flattens std::tuple returns into multiple return types +template +FunctionSchema createFunctionSchemaFromTraitsFlattenedReturns() { + using ReturnType = typename FunctionTraits::return_type; + using ParameterTypes = typename FunctionTraits::parameter_types; + + // arguments and returns are computed into a std::array at compile time and embedded into the binary. + // The only code executed at runtime here is the one that creates a std::vector + // of the arguments/returns from the std::array. + constexpr auto arguments = createArguments::call(); + constexpr auto returns = createReturns::call(); + + return make_function_schema(arguments, returns); +} + +/// Creates a `FunctionSchema` object from a `FunctionTraits` type for a +/// function. Preserves std::tuple returns as a Tuple return type +template +FunctionSchema createFunctionSchemaFromTraitsSingleReturn(std::string&& name, std::string&& overload_name) { + using ReturnType = typename FunctionTraits::return_type; + using ParameterTypes = typename FunctionTraits::parameter_types; + + // arguments and returns are computed into a std::array at compile time and embedded into the binary. + // The only code executed at runtime here is the one that creates a std::vector + // of the arguments/returns from the std::array. + constexpr auto arguments = createArguments::call(); + constexpr auto returns = createSingleReturn::call(); + + return make_function_schema(std::move(name), std::move(overload_name), arguments, returns); +} + +} + +template +FunctionSchema inferFunctionSchemaFlattenedReturns() { + return detail::infer_schema::createFunctionSchemaFromTraitsFlattenedReturns>(); +} + +template +FunctionSchema inferFunctionSchemaSingleReturn(std::string&& name, std::string&& overload_name) { + return detail::infer_schema::createFunctionSchemaFromTraitsSingleReturn>(std::move(name), std::move(overload_name)); +} + +TORCH_API std::optional findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified); + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h new file mode 100644 index 0000000000000000000000000000000000000000..85169f8a1ab8684c84e08188ef66fe9e945ed7ec --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h @@ -0,0 +1,186 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// TODO: unify to C10_MOBILE. In theory this header could be used in OSS. +#ifdef TEMPLATE_SELECTIVE_BUILD +#include +#endif + +/** + * This header implements functionality to build PyTorch with only a certain + * set of operators (+ dependencies) included. + * + * - Build with -DTORCH_OPERATOR_WHITELIST="aten::add;aten::sub" and only these + * two ops will be included in your build. The allowlist records operators + * only, no overloads; if you include aten::add, all overloads of aten::add + * will be included. + * + * Internally, this is done by removing the operator registration calls + * using compile time programming, and the linker will then prune all + * operator functions that weren't registered. + * See Note [Selective build] for more details + * + * WARNING: The allowlist mechanism doesn't work for all ways you could go about + * registering an operator. If the dispatch key / operator name is not + * sufficiently obvious at compile time, then the allowlisting mechanism + * will fail (and the operator will be included in the binary anyway). + */ + +#include +#include +#include + + +#if defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE) +#include +#endif + +namespace c10::impl { + +constexpr bool allowlist_contains(std::string_view allowlist, std::string_view item); // Forward Declare + +/** + * In selective build mode returns true/false depending on whether a build + * feature is available or not. + * + * In instrumenting mode (tracing mode), always returns true, and doesn't + * trigger any side effects. + */ +constexpr bool is_build_feature_available(const char* name) { +#if !defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE) + // Selective Build mode. +#if !defined(TORCH_BUILD_FEATURE_ALLOWLIST) + (void)name; + return true; +#else + return allowlist_contains( + C10_STRINGIZE(TORCH_BUILD_FEATURE_ALLOWLIST), + name); +#endif + +#else + // Instrumenting mode. + (void)name; + return true; +#endif +} + +[[noreturn]] void build_feature_required_feature_not_available(const char* feature); + +/** + * Use BUILD_FEATURE_REQUIRED macro in user-code. + * + * In selective build mode becomes a no-op if the build feature passed + * in is available. If not available, throws an exception (c10::Error). + * The compiler is able to perform dead code elimination for code + * following this method if the build feature is not available. + * + * In instrumenting mode (tracing mode), registers (as a side effect) + * the presence of this specific build feature being triggered. + */ +#if !defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE) // selective build mode + +#if defined(TORCH_BUILD_FEATURE_ALLOWLIST) +#define BUILD_FEATURE_REQUIRED(NAME) \ + if (!c10::impl::is_build_feature_available(NAME)) { \ + ::c10::impl::build_feature_required_feature_not_available(NAME); \ + } +#else // Everything trivially selected +#define BUILD_FEATURE_REQUIRED(NAME) + +#endif + +#else // trace mode +#define BUILD_FEATURE_REQUIRED(NAME) \ + RECORD_FUNCTION_WITH_SCOPE( \ + at::RecordScope::BUILD_FEATURE, \ + std::string(NAME), \ + {}); +#endif + +// Use this macro, and not is_build_feature_available +#define BUILD_FEATURE_AVAILABLE(NAME) ::c10::impl::is_build_feature_available(NAME) + +// returns true iff allowlist contains item +// allowlist_contains("a;bc;d", "bc") == true +constexpr bool allowlist_contains(std::string_view allowlist, std::string_view item) { + //Choose a really big value for next so that if something goes wrong + //this code will blow up in a hopefully detectable way. + size_t next = std::numeric_limits::max(); + for (size_t cur = 0; cur <= allowlist.size(); cur = next) { + next = allowlist.find(';', cur); + if (next != std::string_view::npos) { + if (allowlist.substr(cur, next - cur) == item) { + return true; + } + next++; + } else { + if (allowlist.substr(cur) == item) { + return true; + } + break; + } + } + return false; +} + +// Returns true iff the given op name is on the allowlist +// and should be registered +constexpr bool op_allowlist_check(std::string_view op_name [[maybe_unused]]) { + assert(op_name.find("::") != std::string_view::npos); + // Use assert() instead of throw() due to a gcc bug. See: + // https://stackoverflow.com/questions/34280729/throw-in-constexpr-function + // https://github.com/fmtlib/fmt/issues/682 + assert(op_name.find('(') == std::string_view::npos); +#if !defined(TORCH_OPERATOR_WHITELIST) + // If the TORCH_OPERATOR_WHITELIST parameter is not defined, + // all ops are to be registered + return true; +#else + return allowlist_contains( + C10_STRINGIZE(TORCH_OPERATOR_WHITELIST), + // This function is majorly used for mobile selective build with + // root operators, where the overload is included in the allowlist. + op_name); + // // Strip overload name (as allowlist doesn't contain overloads) + // // Another function based on this may be added when there's usage + // // on op names without overload. + // OperatorNameView::parse(op_name).name); +#endif +} + +// Returns true iff the given schema string is on the allowlist +// and should be registered +constexpr bool schema_allowlist_check(std::string_view schema) { +#if defined(TORCH_FORCE_SCHEMA_REGISTRATION) + return true; +#else + return op_allowlist_check(schema.substr(0, schema.find('('))); +#endif +} + +// Returns true iff the given custom class name is on the allowlist +// and should be registered +constexpr bool custom_class_allowlist_check(std::string_view custom_class_name [[maybe_unused]]) { +#if !defined(TORCH_CUSTOM_CLASS_ALLOWLIST) + // If the TORCH_CUSTOM_CLASS_ALLOWLIST parameter is not defined, + // all custom classes are to be registered + return true; +#else + return allowlist_contains( + C10_STRINGIZE(TORCH_CUSTOM_CLASS_ALLOWLIST), + custom_class_name); +#endif +} + +// schema_allowlist_check() implicitly depends on a macro, TORCH_OPERATOR_WHITELIST. +// Add this API to pass arbitrary allowlist. +constexpr bool op_allowlist_contains_name_in_schema(std::string_view allowlist, std::string_view schema) { + return allowlist_contains(allowlist, schema.substr(0, schema.find('('))); +} + +} // namespace c10::impl + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h new file mode 100644 index 0000000000000000000000000000000000000000..6e5f8ffe59479fb8e8da0dcf4716b6d14c9d15db --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h @@ -0,0 +1,599 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +/** + * Include this file if you want to register operators. It includes all + * functionality needed to do so for you. + */ + +#include +#include +#include +#include +#include +#include +#include +#if defined(EXPOSE_C2_OPS) || !defined(CAFFE2_IS_XPLAT_BUILD) +#include +#endif +#include + +namespace c10 { + +namespace detail { +// The first argument of the schema might be of type DispatchKeySet, in which case we remove it. +// We do this because every argument in a function schema is expected to be convertible +// to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of. +// See Note [Plumbing Keys Through The Dispatcher] +template +std::unique_ptr inferFunctionSchemaFromFunctor() { + using func_type = typename c10::remove_DispatchKeySet_arg_from_func::func_type; + return std::make_unique(inferFunctionSchemaFlattenedReturns()); +} +} + +/** + * An instance of this class handles the registration for one or more operators. + * Make sure you keep the RegisterOperators instance around since it will + * deregister the operator it's responsible for in its destructor. + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU)); + */ +class TORCH_API RegisterOperators final { +public: + RegisterOperators() = default; + ~RegisterOperators() = default; + + RegisterOperators(const RegisterOperators&) = delete; + RegisterOperators& operator=(const RegisterOperators&) = delete; + RegisterOperators(RegisterOperators&&) noexcept = default; + RegisterOperators& operator=(RegisterOperators&&) noexcept = default; + + class TORCH_API Options final { + public: + Options(const Options&) = delete; + Options(Options&&) noexcept = delete; + Options& operator=(const Options&) = delete; + Options& operator=(Options&&) noexcept = delete; + + // internal-only for registering stack based kernels + template + Options&& kernel(DispatchKey dispatch_key) && { + return std::move(*this).kernel(dispatch_key, KernelFunction::makeFromBoxedFunction(), std::nullopt, nullptr); + } + + // internal-only for registering stack based catch-all kernels + template + Options&& catchAllKernel() && { + return std::move(*this).kernel(std::nullopt, KernelFunction::makeFromBoxedFunction(), std::nullopt, nullptr); + } + + // internal only for registering caffe2 ops + Options&& schema(FunctionSchema&& schema) { + TORCH_CHECK(!schemaOrName_.has_value(), "You can only specify the schema once per operator registration."); + schemaOrName_ = FunctionSchema(std::move(schema)); + return std::move(*this); + } + + /** + * Use this to specify the schema for an operator. You can also specify + * the operator name only to have the function signature part of the + * schema be inferred from the kernel function. + * + * Example: + * + * > // Infer function signature from my_kernel_cpu + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU)); + * > + * > + * > // Explicitly specify full schema + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op(Tensor a) -> Tensor") + * > .kernel(DispatchKey::CPU)); + */ + Options&& schema(const std::string& schemaOrName) { + TORCH_CHECK(!schemaOrName_.has_value(), "Tried to register operator ", schemaOrName," but specified schema multiple times. You can only specify the schema once per operator registration."); + + #if !defined(EXPOSE_C2_OPS) && defined(CAFFE2_IS_XPLAT_BUILD) + throw std::logic_error("Tried to register operator " + schemaOrName + ". We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build."); + #else + schemaOrName_ = torch::jit::parseSchemaOrName(schemaOrName); + #endif + + return std::move(*this); + } + + /** + * Use this to register an operator whose kernel is implemented as a functor. + * The kernel is only called for inputs matching the given dispatch key. + * You can register multiple kernels for different dispatch keys. + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU)); + * + * The functor constructor can take arguments to configure the kernel. + * The arguments are defined in the kernel registration. + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > explicit my_kernel_cpu(std::string some_configuration, int a, bool b) + * > : ... {...} + * > + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU, "some_configuration", 3, true)); + */ + template + // enable_if: only enable it if KernelFunctor is actually a functor + std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key, ConstructorParameters&&... constructorParameters) && { + static_assert(std::is_base_of_v, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + static_assert(std::is_constructible_v, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); + + return std::move(*this).kernel( + dispatch_key, + KernelFunction::makeFromUnboxedFunctor(std::make_unique(std::forward(constructorParameters)...)), + impl::CppSignature::make(), + detail::inferFunctionSchemaFromFunctor() + ); + } + + /** + * Use this to register an operator whose kernel is implemented as a functor. + * The kernel is a catch-all kernel, meaning it's called independent from + * the input. Dispatch is disabled for this operator. + * + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .catchAllKernel()); + * + * The functor constructor can take arguments to configure the kernel. + * The arguments are defined in the kernel registration. + * Example: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > explicit my_kernel_cpu(std::string some_configuration, int a, bool b) + * > : ... {...} + * > + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .catchAllKernel("some_configuration", 3, true)); + */ + template + // enable_if: only enable it if KernelFunctor is actually a functor + std::enable_if_t::value, Options&&> catchAllKernel(ConstructorParameters&&... constructorParameters) && { + static_assert(std::is_base_of_v, "Tried to register a kernel functor using the kernel() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); + static_assert(std::is_constructible_v, "Wrong argument list for constructor of kernel functor. The arguments to kernel(arguments...) must match one of the constructors of Functor."); + + return std::move(*this).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedFunctor(std::make_unique(std::forward(constructorParameters)...)), + impl::CppSignature::make(), + detail::inferFunctionSchemaFromFunctor() + ); + } + + /** + * Use this to register an operator whose kernel is implemented by a function. + * The kernel is only called for inputs matching the given dispatch key. + * You can register multiple kernels for different dispatch keys. + * + * Example: + * + * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU)); + */ + template + // enable_if: only enable it if FuncType is actually a function + std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key) && { + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr"); + + return std::move(*this).kernel( + dispatch_key, + KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoFunctor + detail::inferFunctionSchemaFromFunctor>::type>() + ); + } + + /** + * Use this to register an operator whose kernel is implemented by a function. + * The kernel is a catch-all kernel, meaning it's called independent from + * the input. Dispatch is disabled for this operator. + * + * Example: + * + * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } + * > + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .catchAllKernel()); + */ + template + // enable_if: only enable it if FuncType is actually a function + std::enable_if_t::value, Options&&> catchAllKernel() && { + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr"); + + return std::move(*this).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoFunctor + detail::inferFunctionSchemaFromFunctor>::type>() + ); + } + + template + // enable_if: only enable it if FuncType is actually a function + std::enable_if_t::value, Options&&> kernel(DispatchKey dispatch_key, FuncType* kernel_func) && { + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr"); + + return std::move(*this).kernel( + dispatch_key, + KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoFunctor + detail::inferFunctionSchemaFromFunctor>>() + ); + } + + template + // enable_if: only enable it if FuncType is actually a function + std::enable_if_t::value, Options&&> catchAllKernel(FuncType* kernel_func) && { + static_assert(!std::is_same_v, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API."); + TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr"); + + return std::move(*this).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoFunctor + detail::inferFunctionSchemaFromFunctor>>() + ); + } + + /** + * Use this to register an operator whose kernel is implemented as a lambda. + * The kernel is only called for inputs matching the given dispatch key. + * You can register multiple kernels for different dispatch keys. + * + * The lambda must be stateless, i.e. not have a capture. If your kernel + * needs to store some configuration parameters, write the kernel as a + * functor instead. + * + * Example: + * + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .kernel(DispatchKey::CPU, [] (Tensor a) -> Tensor {...})); + */ + template + // enable_if: only enable it if Lambda is a functor (note: lambdas are functors) + std::enable_if_t< + guts::is_functor>::value + && !std::is_same_v>::func_type, KernelFunction::BoxedKernelFunction>, + Options&&> kernel(DispatchKey dispatch_key, Lambda&& functor) && { + static_assert(!std::is_base_of_v>, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel() API instead."); + + // We don't support stateful lambdas (i.e. lambdas with a capture), because their + // behavior would be nonobvious. A functor kernel with cache gets a new instance of + // its cache each time the kernel is looked up from the dispatch table. + // A lambda with a capture would be global and share its capture between all kernel lookups. + // So, instead of making users having to think about it (including the thread-safety + // issues this causes), let's just forbid stateful lambdas altogether. + static_assert(guts::is_stateless_lambda>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel() instead."); + + return std::move(*this).kernel( + dispatch_key, + KernelFunction::makeFromUnboxedLambda(std::forward(functor)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor + detail::inferFunctionSchemaFromFunctor>>() + ); + } + + /** + * Use this to register an operator whose kernel is implemented as a lambda. + * The kernel is a catch-all kernel, meaning it's called independent from + * the input. Dispatch is disabled for this operator. + * + * The lambda must be stateless, i.e. not have a capture. If your kernel + * needs to store some configuration parameters, write the kernel as a + * functor instead. + * + * Example: + * + * > static auto registry = c10::RegisterOperators() + * > .op(c10::RegisterOperators::options() + * > .schema("my_op") + * > .catchAllKernel([] (Tensor a) -> Tensor {...})); + */ + template + // enable_if: only enable it if Lambda is a functor (note: lambdas are functors) + std::enable_if_t< + guts::is_functor>::value + && !std::is_same_v>::func_type, KernelFunction::BoxedKernelFunction>, + Options&&> catchAllKernel(Lambda&& lambda) && { + static_assert(!std::is_base_of_v>, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel() API instead."); + + // We don't support stateful lambdas (i.e. lambdas with a capture), because their + // behavior would be nonobvious. + // A lambda with a capture would be global and share its capture between all kernel lookups. + // This would be a likely source for unexpected race conditions, so we forbid it. + // If a kernel really needs global state, they can just have regular global state + // in their .cpp file next to the kernel lambda. + static_assert(guts::is_stateless_lambda>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel() instead."); + + return std::move(*this).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedLambda(std::forward(lambda)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor + detail::inferFunctionSchemaFromFunctor>>() + ); + } + + Options&& aliasAnalysis(AliasAnalysisKind aliasAnalysisKind) && { + TORCH_CHECK(!aliasAnalysisKind_.has_value(), "You can only call aliasAnalysis() once per operator registration."); + aliasAnalysisKind_ = aliasAnalysisKind; + return std::move(*this); + } + + private: + Options&& kernel(std::optional dispatch_key, KernelFunction&& func, std::optional cpp_signature, std::unique_ptr&& inferred_function_schema) && { + KernelRegistrationConfig config; + config.dispatch_key = dispatch_key; + config.func = std::move(func); + config.cpp_signature = cpp_signature; + config.inferred_function_schema = std::move(inferred_function_schema); + kernels.push_back(std::move(config)); + return std::move(*this); + } + + Options() + : schemaOrName_(std::nullopt) + , aliasAnalysisKind_(std::nullopt) + {} + + // KernelRegistrationConfig accumulates all information from the config + // parameters passed to a RegisterOperators::op() call into one object. + struct KernelRegistrationConfig final { + KernelRegistrationConfig() + : dispatch_key(std::nullopt) + , cpp_signature(std::nullopt) + , inferred_function_schema(nullptr) + {} + + std::optional dispatch_key; + KernelFunction func; + std::optional cpp_signature; + std::unique_ptr inferred_function_schema; + }; + + std::optional> schemaOrName_; + + std::vector kernels; + std::optional aliasAnalysisKind_; + friend class RegisterOperators; + friend class Library; + }; + + /** + * Call this to get an instance of registration options, which + * can be passed to a call to RegisterOperators::op() to specify + * these options for the operator registration. + * See class doc comment for examples. + */ + static Options options() { + return {}; + } + + /** + * Call this to register an operator. See class doc comment for examples. + */ + RegisterOperators&& op(Options&& options) && { + checkSchemaAndRegisterOp_(std::move(options)); + return std::move(*this); + } + + // Regular mutator version of the && version above + RegisterOperators& op(Options&& options) & { + checkSchemaAndRegisterOp_(std::move(options)); + return *this; + } + + /** + * This is a shorthand for RegisterOperators::op(Options) where you can + * specify the operator schema outside of the options parameter. + * See class doc comment for examples. + */ + RegisterOperators&& op(const std::string& schemaOrName, Options&& options = RegisterOperators::options()) && { + return std::move(*this).op(std::move(options).schema(schemaOrName)); + } + + // internal only for registering caffe2 ops + RegisterOperators&& op(FunctionSchema schema, Options&& options) && { + return std::move(*this).op(std::move(options).schema(std::move(schema))); + } + + template + explicit RegisterOperators(const std::string& schemaOrName, FuncType&& func, Options&& options = RegisterOperators::options()) + : RegisterOperators() { + std::move(*this).op(schemaOrName, std::forward(func), std::move(options)); + } + + /** + * This API registers an operator based on a kernel function pointer. + * + * Given a kernel + * + * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} } + * + * This API looks like: + * + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", &my_kernel_cpu); + * + * If your kernel is small and the overhead of calling it matters, + * then this API might be the wrong choice since the following API + * has a slightly lower overhead for calling into the kernel: + * + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel()); + * + * Or, alternatively, write your kernel as a functor: + * + * > namespace { + * > class my_kernel_cpu final : public c10::OperatorKernel { + * > public: + * > Tensor operator()(Tensor a, Tensor b) {...} + * > }; + * > } + * > + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .kernel()); + */ + template + // enable_if: only enable it if FuncType is actually a function, but not a stack based BoxedKernelFunction. + std::enable_if_t::value && !std::is_same_v, RegisterOperators&&> + op(const std::string& schemaOrName, FuncType* func, Options&& options = RegisterOperators::options()) && { + constexpr bool AllowLegacyTypes = true; + return std::move(*this).op(std::move(options).schema(schemaOrName).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedRuntimeFunction(func), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor + detail::inferFunctionSchemaFromFunctor>>() + )); + } + + /** + * This API registers an operator based on a kernel lambda. + * + * This API looks like: + * + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", [] (Tensor a, Tensor b) {...}); + * + * This is equivalent to: + * + * > static auto registry = c10::RegisterOperators() + * > .op("my_op", c10::RegisterOperators::options() + * > .catchAllKernel([] (Tensor a, Tensor b) {...})); + * + */ + template + // enable_if: only enable it if Lambda is actually a stateless lambda + std::enable_if_t::value && guts::is_stateless_lambda>::value, RegisterOperators&&> + op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && { + static_assert(!std::is_base_of_v, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead."); + + constexpr bool AllowLegacyTypes = true; + return std::move(*this).op(std::move(options).schema(schemaOrName).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedLambda(std::forward(lambda)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor + detail::inferFunctionSchemaFromFunctor>>() + )); + } + + template + C10_DEPRECATED_MESSAGE("Registering operator kernels with stateful lambdas (i.e. lambdas with a capture) has non-obvious behavior. This is deprecated. Please use a lambda without a capture or a functor class instead.") + // enable_if: only enable it if Lambda is actually a functor but not a stateless lambda + std::enable_if_t::value && !guts::is_stateless_lambda>::value, RegisterOperators&&> + op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && { + static_assert(!std::is_base_of_v, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead."); + + constexpr bool AllowLegacyTypes = true; + return std::move(*this).op(std::move(options).schema(schemaOrName).kernel( + std::nullopt, + KernelFunction::makeFromUnboxedLambda(std::forward(lambda)), + impl::CppSignature::make(), + // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor + detail::inferFunctionSchemaFromFunctor>>() + )); + } + +private: + void checkSchemaAndRegisterOp_(Options&& config); + + static c10::FunctionSchema inferSchemaFromKernels_(const OperatorName& opNameStr, const Options& options); + void checkNoDuplicateKernels_(const Options& options); + void registerOp_(Options&& options); + + std::vector registrars_; +}; + +} // namespace c10 + +namespace torch { + // Old-style API + using RegisterOperators = c10::RegisterOperators; +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/FlushDenormal.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/FlushDenormal.h new file mode 100644 index 0000000000000000000000000000000000000000..5e3d0ffbd71a5a4dacd80594e3c7222fe2be0a8e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/FlushDenormal.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/// Flush-To-Zero and Denormals-Are-Zero mode +/// +/// Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) are modes that bypass +/// IEEE 754 methods of dealing with denormal floating-point numbers on x86-64 +/// and some x86 CPUs. They result in reduced precision for values near zero, +/// but increased performance. +/// +/// See https://software.intel.com/en-us/articles/x87-and-sse-floating-point-assists-in-ia-32-flush-to-zero-ftz-and-denormals-are-zero-daz + +namespace at::cpu { + +bool set_flush_denormal(bool on); + +} // namespace at::cpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/Utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b2b9a3e9c1051bcf10b0ac1ea57364771062f2b6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/Utils.h @@ -0,0 +1,38 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include + +namespace at::cpu { + +TORCH_API bool is_avx2_supported(); +TORCH_API bool is_avx512_supported(); + +// Detect if CPU support Vector Neural Network Instruction. +TORCH_API bool is_avx512_vnni_supported(); + +// Detect if CPU supports AVX512_BF16 ISA +TORCH_API bool is_avx512_bf16_supported(); + +// Detect if CPU support Advanced Matrix Extension. +TORCH_API bool is_amx_tile_supported(); + +// Detect if CPU support Advanced Matrix Extension for fp16. +TORCH_API bool is_amx_fp16_supported(); + +// Enable the system to use AMX instructions. +TORCH_API bool init_amx(); + +// Get the L1 cache size per core in Byte +TORCH_API uint32_t L1d_cache_size(); + +// Get the L2 cache size per core in Byte +TORCH_API uint32_t L2_cache_size(); + +} // namespace at::cpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/vml.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/vml.h new file mode 100644 index 0000000000000000000000000000000000000000..600c38cfe964817f99c81c8c5c4edbeaabee3fea --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/vml.h @@ -0,0 +1,175 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +// This header implements various unary operations using a MKL VML style +// interface. + +// It implements various functions with a simple interface +// For example it enables the user to call vsin(float* out, const float* in, +// size) This functions takes a pointer to a continuous output array of floats and +// a constant input array. It will then apply sin to each value in the input +// array and write the result into the output array. out and in may point to the +// same memory, i.e. this fully supports in-place operations. These functions +// also implement their own parallelization, so take precautions when calling +// these from threaded functions. + +// When MKL is available it will call into MKL's VML library similar to NumPy +// If MKL is not available it will use SLEEF. + +// This file might be compiled under AVX or AVX2 when called from e.g. +// UnaryOpsKernel.cpp + +#include +#include +#include +#include +#include + +#if AT_MKL_ENABLED() && !defined(__APPLE__) +#include +#endif + + +namespace at::vml { +inline namespace CPU_CAPABILITY { + +using namespace vec; + +template +inline void vrsqrt(scalar_t* out, scalar_t* in, int64_t size) { + parallel_for(0, size, 2048, [out, in](int64_t begin, int64_t end) { + map( + [](const Vectorized& x) { + return Vectorized((scalar_t)1) / x.sqrt(); + }, + out + begin, + in + begin, + end - begin); + }); +} + +// NB: We ignore numerical errors by convention and leave them to the user + +#define IMPLEMENT_VML(op) \ + template \ + inline void v##op(scalar_t* out, const scalar_t* in, int64_t size) { \ + using vec_t = Vectorized>; \ + vec::map([](vec_t x) { return x.op(); }, out, in, size); \ + } \ + +IMPLEMENT_VML(abs) +IMPLEMENT_VML(acos) +IMPLEMENT_VML(asin) +IMPLEMENT_VML(atan) +IMPLEMENT_VML(atanh) +IMPLEMENT_VML(ceil) +IMPLEMENT_VML(cos) +// IMPLEMENT_VML(cosh) +IMPLEMENT_VML(erf) +IMPLEMENT_VML(erfc) +IMPLEMENT_VML(erfinv) +IMPLEMENT_VML(exp) +IMPLEMENT_VML(expm1) +IMPLEMENT_VML(floor) +IMPLEMENT_VML(i0) +IMPLEMENT_VML(i0e) +IMPLEMENT_VML(digamma) +IMPLEMENT_VML(reciprocal) +IMPLEMENT_VML(log) +IMPLEMENT_VML(log10) +IMPLEMENT_VML(log1p) +IMPLEMENT_VML(log2) +IMPLEMENT_VML(neg) +IMPLEMENT_VML(sin) +// IMPLEMENT_VML(sinh) +IMPLEMENT_VML(sqrt) +IMPLEMENT_VML(round) +IMPLEMENT_VML(rsqrt) +IMPLEMENT_VML(tan) +IMPLEMENT_VML(tanh) +IMPLEMENT_VML(trunc) +IMPLEMENT_VML(lgamma) + + +#if AT_MKL_ENABLED() && !defined(__APPLE__) + +// NB: LP64 MKL is the most commonly used and thus we assume it here. That means +// we need to expect MKL_INT to be of type int, which implies int32_t or int64_t in most +// cases. +static_assert( + std::is_same_v || std::is_same_v, + "MKL_INT is assumed to be int32_t or int64_t"); +#define IMPLEMENT_VML_MKL_STUB(op, mklop, type, mkltype) \ + template <> \ + inline void v##op(type * out, const type * in, int64_t size) { \ + auto constexpr max_mkl_ind = std::numeric_limits::max(); \ + if (size <= static_cast(max_mkl_ind)) { \ + vm##mkltype##mklop( \ + size, in, out, VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE); \ + } else { \ + int64_t ind = 0; \ + int64_t chunks = size / max_mkl_ind; \ + int64_t rest = size % max_mkl_ind; \ + for (; ind < chunks; ind++) { \ + vm##mkltype##mklop( \ + max_mkl_ind, \ + in + ind * max_mkl_ind, \ + out + ind * max_mkl_ind, \ + VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE); \ + } \ + vm##mkltype##mklop( \ + rest, \ + in + ind * max_mkl_ind, \ + out + ind * max_mkl_ind, \ + VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE); \ + } \ + } + +#define IMPLEMENT_VML_MKL(op, mklop) \ + IMPLEMENT_VML_MKL_STUB(op, mklop, float, s) \ + IMPLEMENT_VML_MKL_STUB(op, mklop, double, d) + +// NB: abs, cosh and sinh were temporarily disabled due to issues with Apple +// NB: expm1 is disabled because on some configs it produces expm1(nan)=-1 +IMPLEMENT_VML_MKL(acos, Acos) +IMPLEMENT_VML_MKL(asin, Asin) +IMPLEMENT_VML_MKL(atan, Atan) +IMPLEMENT_VML_MKL(cos, Cos) +// IMPLEMENT_VML_MKL(cosh, Cosh) +IMPLEMENT_VML_MKL(erf, Erf) +IMPLEMENT_VML_MKL(erfc, Erfc) +IMPLEMENT_VML_MKL(erfinv, ErfInv) +IMPLEMENT_VML_MKL(exp, Exp) +// IMPLEMENT_VML_MKL(expm1, Expm1) +IMPLEMENT_VML_MKL(log, Ln) +IMPLEMENT_VML_MKL(log10, Log10) +IMPLEMENT_VML_MKL(sin, Sin) +// IMPLEMENT_VML_MKL(sinh, Sinh) +IMPLEMENT_VML_MKL(sqrt, Sqrt) +IMPLEMENT_VML_MKL(tan, Tan) +IMPLEMENT_VML_MKL(tanh, Tanh) +IMPLEMENT_VML_MKL(trunc, Trunc) + +// Not vectorized in MKL version tested +// IMPLEMENT_VML_MKL(abs, Abs) +// IMPLEMENT_VML_MKL(log1p, Log1p) + +#if INTEL_MKL_VERSION >= 20180406 +IMPLEMENT_VML_MKL(log2, Log2) +#endif + +#endif + +} // namespace +} // namespace at::vml + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/BLASConstants.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/BLASConstants.h new file mode 100644 index 0000000000000000000000000000000000000000..29060f6488fd51cd14b636d4e4bf53e5292723f3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/BLASConstants.h @@ -0,0 +1,16 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::cuda::detail { + +float *get_cublas_device_one(); +float *get_cublas_device_zero(); +float *get_user_alpha_ptr(); + +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h new file mode 100644 index 0000000000000000000000000000000000000000..05f78c9cfff2acaf1c35bd66684a429f83f7c6ce --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h @@ -0,0 +1,76 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include + +// TODO: No need to have this whole header, we can just put it all in +// the cpp file + +namespace at::cuda::detail { + +// Set the callback to initialize Magma, which is set by +// torch_cuda_cu. This indirection is required so magma_init is called +// in the same library where Magma will be used. +TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)()); + + +// The real implementation of CUDAHooksInterface +struct CUDAHooks : public at::CUDAHooksInterface { + CUDAHooks(at::CUDAHooksArgs /*unused*/) {} + void init() const override; + Device getDeviceFromPtr(void* data) const override; + bool isPinnedPtr(const void* data) const override; + const Generator& getDefaultGenerator( + DeviceIndex device_index = -1) const override; + Generator getNewGenerator( + DeviceIndex device_index = -1) const override; + bool hasCUDA() const override; + bool hasMAGMA() const override; + bool hasCuDNN() const override; + bool hasCuSOLVER() const override; + bool hasCuBLASLt() const override; + bool hasROCM() const override; + bool hasCKSDPA() const override; + bool hasCKGEMM() const override; + const at::cuda::NVRTC& nvrtc() const override; + DeviceIndex current_device() const override; + bool isBuilt() const override {return true;} + bool isAvailable() const override {return hasCUDA();} + bool hasPrimaryContext(DeviceIndex device_index) const override; + Allocator* getCUDADeviceAllocator() const override; + Allocator* getPinnedMemoryAllocator() const override; + bool compiledWithCuDNN() const override; + bool compiledWithMIOpen() const override; + bool supportsDilatedConvolutionWithCuDNN() const override; + bool supportsDepthwiseConvolutionWithCuDNN() const override; + bool supportsBFloat16ConvolutionWithCuDNNv8() const override; + bool supportsBFloat16RNNWithCuDNN() const override; + bool hasCUDART() const override; + long versionCUDART() const override; + long versionCuDNN() const override; + long versionRuntimeCuDNN() const override; + long versionCuDNNFrontend() const override; + long versionMIOpen() const override; + std::string showConfig() const override; + double batchnormMinEpsilonCuDNN() const override; + int64_t cuFFTGetPlanCacheMaxSize(DeviceIndex device_index) const override; + void cuFFTSetPlanCacheMaxSize(DeviceIndex device_index, int64_t max_size) const override; + int64_t cuFFTGetPlanCacheSize(DeviceIndex device_index) const override; + void cuFFTClearPlanCache(DeviceIndex device_index) const override; + int getNumGPUs() const override; + DeviceIndex deviceCount() const override; + DeviceIndex getCurrentDevice() const override; + +#ifdef USE_ROCM + bool isGPUArch(const std::vector& archs, DeviceIndex device_index = -1) const override; +#endif + void deviceSynchronize(DeviceIndex device_index) const override; +}; + +} // at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h new file mode 100644 index 0000000000000000000000000000000000000000..0c5e22a6f2642c79fbbbd37495cd2195fe262738 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h @@ -0,0 +1,156 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Some stateful GPU libraries, such as cuDNN, cuBLAS, use handles to store states. +// These handles are tied to device, and these libraries requires/recommends not to +// share handles across host threads. +// +// These libraries recommend using one handle per host thread. We may not want to do +// this because threads are relatively light-weight, but creating and destroying +// handles is expensive (destroying the handle causes synchronizations). DataParallel, +// for example, creates new threads for each forward pass. +// +// This file implements a handle pool mechanism. The handle pool returns handles on +// demand as threads request them. If all existing handles in the pool are in use, +// it creates a new one. As threads terminate, they release handles back into the pool. +// In this way, the handle pool never creates more handles than the high-water mark of +// active threads, so it's efficient with DataParallel. + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace at::cuda { namespace { + +template +struct DeviceThreadHandlePool : public std::enable_shared_from_this> { + + struct Handle { + Handle_t handle; + Handle(bool create = false) : handle(nullptr) + { + if(create) Create(&handle); + } + // std::vector.emplace() and push_back() may route through temporaries and call + // copy/move constructors along the way. If this is the case, we don't want + // the destructors of temporaries to call cudnnDestroy on the handle. + // We can achieve safety (for the narrow case of stashing within std::vectors) + // by making Handle moveable but not copyable, and transferring handle ownership + // to the latest constructed object. This is not a substitute for full-blown + // reference counting, but reference counting may be overkill here. + // Another alternative is to wrap the saved Handles in unique_ptrs, i.e., + // unordered_map>> created_handles; + Handle(const Handle& rhs) = delete; + // Following https://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom + Handle(Handle&& rhs) noexcept : Handle() { std::swap(handle, rhs.handle); } + // operator= takes argument by value + Handle& operator=(Handle rhs) { std::swap(handle, rhs.handle); return *this; } + ~Handle() { + if(handle) Destroy(handle); + } + }; + + std::mutex mutex; + + // Handles are lazily created as different threads request them, + // but are never destroyed until the end of the process. + // The maximum number of handles this process will create for each device is equal + // to the high-water mark of the number of concurrently active threads that request + // handles for that device. + // When threads terminate, they release their handles back into the pool for reuse. + // Otherwise, new handles would be created every time new threads were spawned, + // resulting in poor performance for Python modules that repeatedly or frequently + // spawned new sets of threads (like DataParallel, which creates a new set of threads + // for each forward pass). + // + // To prevent potential deadlocks, we explicitly choose not to cap the number + // of handles that are created per device. + // Example of danger: If we cap the max handles at 4, and 5 threads are sharing a device, + // only 4 can make forward progress at any time. The other 4 will not release their + // handles until they exit, so the fifth cannot make progress until then. This is + // not a problem...UNLESS all 5 threads attempt some sort of synchronization at an + // intermediate point (ie, before any of them have exited). We have no way to anticipate + // or enforce that user threads will not attempt such intermediate synchronization. + // The only way to ensure safety is to avoid imposing a cap on the number of handles. + std::unordered_map> created_handles; + std::unordered_map> available_handles; + + // PoolWindow lazily creates and caches the handles that a particular thread is using, + // so in the common case handle access doesn't incur either handle creation or a mutex lock. + class PoolWindow + { + public: + PoolWindow(std::shared_ptr parent): weak_parent(std::move(parent)) {} + ~PoolWindow(){ release(); } + + Handle_t reserve(int device) + { + // If this thread already has a handle for this device, return it + if(my_handles.find(device) != my_handles.end()) + return my_handles[device]; + + // otherwise, either grab a handle from the pool if one is available, + // or if not, create a new one. + auto parent = weak_parent.lock(); + TORCH_CHECK(parent, "Cannot create handle during program termination"); + std::lock_guard guard(parent->mutex); + + if(parent->available_handles[device].size() > 0) + { + my_handles[device] = parent->available_handles[device].back(); + parent->available_handles[device].pop_back(); + } + else + { + // In local testing, I do observe that emplace_back sometimes routes through temporaries + // that incur move-constructor and destructor calls. See comments in Handle above. + parent->created_handles[device].emplace_back(true /*create*/); + my_handles[device] = parent->created_handles[device].back().handle; + } + + return my_handles[device]; + } + + private: + // Stores the per-device handles currently owned by this thread + std::unordered_map my_handles; + + std::weak_ptr weak_parent; + + // Called by the destructor. Releases this thread's handles back into the pool. + void release() { + if(!my_handles.empty()) { + auto parent = weak_parent.lock(); + if (!parent) { + // If this thread exits after atexit handlers have completed, the + // cuda context itself may be invalid, so we must leak the handles. + return; + } + + std::lock_guard guard(parent->mutex); + for(auto d_h : my_handles) + parent->available_handles[d_h.first].push_back(d_h.second); + } + } + }; + + // Warning: + // If you want to change this function, be aware that this function will be called + // by multiple threads and there is no mutex guarding the call of this function, so + // make sure your implementation is thread-safe. + PoolWindow *newPoolWindow() { + // The returned pointer will be owned by a thread local variable + // so that different threads does not share the same PoolWindow. + return new PoolWindow(this->shared_from_this()); + } +}; + +}} // namespace at::cuda::detail:: + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6a09141b9b31c739fdbb50834397f9a92f1ca7f4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::cuda::detail { + +TORCH_CUDA_CU_API bool maybeOverlappingIndices(const at::TensorBase &t); +using at::native::canUse32BitIndexMath; + +template +TensorInfo +getTensorInfo(const at::TensorBase &t) { + IndexType sz[MAX_TENSORINFO_DIMS]; + IndexType st[MAX_TENSORINFO_DIMS]; + + int dims = t.dim(); + for (int i = 0; i < dims; ++i) { + sz[i] = t.size(i); + st[i] = t.stride(i); + } + + scalar* data_ptr = nullptr; + + if constexpr (std::is_const_v) { + data_ptr = t.const_data_ptr(); + } else { + data_ptr = t.mutable_data_ptr(); + } + + return TensorInfo( + data_ptr, dims, sz, st); +} + +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh new file mode 100644 index 0000000000000000000000000000000000000000..432117f154c419067542cf2e3f5f51059b2068ab --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh @@ -0,0 +1,129 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +#include +#endif + +namespace at::cuda::detail { + +// A utility class to implement integer division by multiplication, given a fixed +// divisor. +// +// WARNING: The fast divider algorithm is only implemented for unsigned int; +// otherwise we default to plain integer division. For unsigned int, +// we further assume that the dividend is at most INT32_MAX. Thus, +// IntDivider must NOT be used for general integer division. +// +// This reduced range is enough for our purpose, and it allows us to +// slightly simplify the computation. +// +// (NOTE: Below, "2^k" denotes exponentiation, i.e., 1< 0), we can find a "magic number" m (2^N +// <= m < 2^(N+1)) and shift s such that: +// +// \floor(n / d) = \floor((m * n) / 2^(N+s)). +// +// Given such m and s, the integer division can be then implemented as: +// +// let m' = m - 2^N // 0 <= m' < 2^N +// +// fast_integer_division(n): +// // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned +// // integer. Then take the higher N bits. +// t = (m' * n) >> N +// +// // Here we use the fact that n is less than 2^(N-1): otherwise the value +// // of (t + n) may not fit in an N-bit integer. +// return (t + n) >> s +// +// Finding such a magic number is surprisingly easy: +// +// s = \ceil(\log_2 d) +// m' = \floor(2^N * (2^s - d) / d) + 1 // Need 2N-bit integer arithmetic. +// +// See also: +// - Division by Invariant Integers Using Multiplication, +// Torbjörn Granlund and Peter L. Montgomery, 1994. +// +// - http://www.hackersdelight.org/magic.htm +// +// - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html + +// Result of div/mod operation stored together. +template +struct DivMod { + Value div, mod; + + C10_HOST_DEVICE DivMod(Value div, Value mod) : div(div), mod(mod) { } +}; + +// Base case: we only have an implementation for uint32_t for now. For +// everything else, we use plain division. +template +struct IntDivider { + IntDivider() = default; + IntDivider(Value d) : divisor(d) { } + + C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; } + C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; } + C10_HOST_DEVICE inline DivMod divmod(Value n) const { + return DivMod(n / divisor, n % divisor); + } + + Value divisor; +}; + +// Implement fast integer division. +template <> +struct IntDivider { + static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int."); + + IntDivider() = default; + + IntDivider(unsigned int d) : divisor(d) { + assert(divisor >= 1 && divisor <= INT32_MAX); + + // TODO: gcc/clang has __builtin_clz() but it's not portable. + for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break; + + uint64_t one = 1; + uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; + m1 = magic; + assert(m1 > 0 && m1 == magic); // m1 must fit in 32 bits. + } + + C10_HOST_DEVICE inline unsigned int div(unsigned int n) const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and + // 'm1'. + unsigned int t = __umulhi(n, m1); + return (t + n) >> shift; +#else + // Using uint64_t so that the addition does not overflow. + uint64_t t = ((uint64_t) n * m1) >> 32; + return (t + n) >> shift; +#endif + } + + C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const { + return n - div(n) * divisor; + } + + C10_HOST_DEVICE inline DivMod divmod(unsigned int n) const { + unsigned int q = div(n); + return DivMod(q, n - q * divisor); + } + + unsigned int divisor; // d above. + unsigned int m1; // Magic number: m' above. + unsigned int shift; // Shift amounts. +}; + +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..99562629fe531d9468fd8ec51bd98b2a492d4c35 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h @@ -0,0 +1,42 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::cuda::detail { + +// CUDA: grid stride looping +// +// int64_t _i_n_d_e_x specifically prevents overflow in the loop increment. +// If input.numel() < INT_MAX, _i_n_d_e_x < INT_MAX, except after the final +// iteration of the loop where _i_n_d_e_x += blockDim.x * gridDim.x can be +// greater than INT_MAX. But in that case _i_n_d_e_x >= n, so there are no +// further iterations and the overflowed value in i=_i_n_d_e_x is not used. +#define CUDA_KERNEL_LOOP_TYPE(i, n, index_type) \ + int64_t _i_n_d_e_x = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x; \ + for (index_type i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x) + +#define CUDA_KERNEL_LOOP(i, n) CUDA_KERNEL_LOOP_TYPE(i, n, int) + + +// Use 1024 threads per block, which requires cuda sm_2x or above +constexpr int CUDA_NUM_THREADS = 1024; + +// CUDA: number of blocks for threads. +inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) { + TORCH_INTERNAL_ASSERT(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N); + constexpr int64_t max_int = std::numeric_limits::max(); + + // Round up division for positive number that cannot cause integer overflow + auto block_num = (N - 1) / max_threads_per_block + 1; + TORCH_INTERNAL_ASSERT(block_num <= max_int, "Can't schedule too many blocks on CUDA device"); + + return static_cast(block_num); +} + +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h new file mode 100644 index 0000000000000000000000000000000000000000..bab1495dda3989f4a491d3545ee23f8eec4c3773 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h @@ -0,0 +1,16 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +namespace at::cuda { +// Forward-declares at::cuda::NVRTC +struct NVRTC; + +namespace detail { +extern NVRTC lazyNVRTC; +} // namespace detail + +} // namespace at::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh new file mode 100644 index 0000000000000000000000000000000000000000..5bd215318125ff9f0d9846b2adc2e3c9cb1c2e48 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh @@ -0,0 +1,141 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +// If element_sizes is nullptr, then the strides will be in bytes, otherwise +// the strides will be in # of elements. +// Operands that share the same shape, but may have different strides. +// OffsetCalculator iterates the tensor in a column-major order + +#if defined(USE_ROCM) +constexpr int MAX_DIMS = 16; +#else +constexpr int MAX_DIMS = 25; +#endif + +template +struct OffsetCalculator { + // We allow having negative strides to implement some operations like torch.flip + using stride_t = std::conditional_t, + index_t>; + // The offset for each argument. Wrapper around fixed-size array. + // On CUDA, zero sized array is not allowed, so when we are handling nullary + // operators, we need to create a size 1 offset to avoid compiler failure. + // This size 1 offset is just a placeholder, and we will not use it. + using offset_type = std::array(NARGS, 1)>; + + // if element_sizes is nullptr, then the strides will be in bytes, otherwise + // the strides will be in # of elements. + OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes=nullptr) : dims(dims) { + TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims"); + for (int i=0; i < dims; i++){ + sizes_[i] = at::cuda::detail::IntDivider(sizes[i]); + for (int arg = 0; arg < NARGS; arg++) { + int64_t element_size = (element_sizes == nullptr ? 1LL : element_sizes[arg]); + strides_[i][arg] = strides[arg][i] / element_size; + } + } + } + + C10_HOST_DEVICE offset_type get(index_t linear_idx) const { + offset_type offsets; + +#if defined(USE_ROCM) + if ((dims > 0) && (dims <= 2)) { + auto divmod = sizes_[0].divmod(linear_idx); +#pragma unroll + for (int arg = 0; arg < NARGS; arg++) + offsets[arg] = divmod.mod * strides_[0][arg]; + if (dims >= 2) { + divmod = sizes_[1].divmod(divmod.div); +#pragma unroll + for (int arg = 0; arg < NARGS; arg++) + offsets[arg] += divmod.mod * strides_[1][arg]; + } + // [...] + return offsets; + } +#endif + + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] = 0; + } + + #pragma unroll + for (int dim = 0; dim < MAX_DIMS; ++dim) { + if (dim == dims) { + break; + } + auto divmod = sizes_[dim].divmod(linear_idx); + linear_idx = divmod.div; + + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] += divmod.mod * strides_[dim][arg]; + } + + } + return offsets; + } + + int dims; + at::cuda::detail::IntDivider sizes_[MAX_DIMS]; + stride_t strides_[MAX_DIMS][std::max(NARGS, 1)]; +}; + +template +struct TrivialOffsetCalculator { + // The offset for each argument. Wrapper around fixed-size array. + // The offsets are in # of elements, not in bytes. + // On CUDA, zero sized array is not allowed, so when we are handling nullary + // operators, we need to create a size 1 offset to avoid compiler failure. + // This size 1 offset is just a placeholder, and we will not use it. + using offset_type = std::array(NARGS, 1)>; + + C10_HOST_DEVICE offset_type get(index_t linear_idx) const { + offset_type offsets; + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] = linear_idx; + } + return offsets; + } +}; + +// Make an OffsetCalculator with byte offsets +template +static OffsetCalculator make_offset_calculator(const at::TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(N <= iter.ntensors()); + std::array strides; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i).data(); + } + return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data()); +} + +// Make an OffsetCalculator with element offsets +template +static OffsetCalculator make_element_offset_calculator( + const at::TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(N <= iter.ntensors()); + std::array strides; + std::array element_sizes; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i).data(); + element_sizes[i] = iter.element_size(i); + } + return OffsetCalculator( + iter.ndim(), iter.shape().data(), strides.data(), element_sizes.data()); +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e50519eb6a4fc842293e766f162ed26c7a028bd5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh @@ -0,0 +1,48 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// No "#pragma once" because this is a raw definition that can be copied by jit codegen. +// Eager mode clients should not include this file directly, instead, +// they should #include , which has a #pragma once. + +// Stores RNG state values. Passed as a kernel argument. +// See Note [CUDA Graph-safe RNG states]. +// +// The raw definition lives in its own file so jit codegen can easily copy it. +namespace at { + +struct PhiloxCudaState { + PhiloxCudaState() = default; + // Called if graph capture is not underway + PhiloxCudaState(uint64_t seed, + uint64_t offset) { + seed_.val = seed; + offset_.val = offset; + } + // Called if graph capture is underway + PhiloxCudaState(int64_t* seed, + int64_t* offset_extragraph, + uint64_t offset_intragraph) { + seed_.ptr = seed; + offset_.ptr = offset_extragraph; + offset_intragraph_ = offset_intragraph; + captured_ = true; + } + + // Public members, directly accessible by at::cuda::philox::unpack. + // If we made them private with getters/setters, the getters/setters + // would have to be __device__, and we can't declare __device__ in ATen. + union Payload { + uint64_t val; + int64_t* ptr; + }; + + Payload seed_{}; + Payload offset_{}; + uint64_t offset_intragraph_ = 0; + bool captured_ = false; +}; + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2d372718a4e786d676fff76c50da662e370be6ee --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh @@ -0,0 +1,121 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::cuda::detail { + +#define MAX_TENSORINFO_DIMS 25 + +// CUDA kernel argument that defines tensor layout +template +struct TensorInfo { + TensorInfo(); + TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]); + + // Set the size of the given dimension to 1, as if it were a + // reduction dim (allows you to calculate offsets of the reduction + // slice) + void reduceDim(int dim); + + // See note on [collapse dims]. + int collapseDims(const int excludeDim = -1); + + // Contiguous tensors of more than one dimension are collapsed down + // to one tensor + __host__ __device__ inline bool isContiguous() const { + return (dims == 1 && strides[0] == 1); + } + + T* data; + IndexType sizes[MAX_TENSORINFO_DIMS]; + IndexType strides[MAX_TENSORINFO_DIMS]; + int dims; +}; + +template +TensorInfo::TensorInfo() { + data = nullptr; + dims = 0; +} + +template +TensorInfo::TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]) { + data = p; + dims = dim; + TORCH_CHECK(dims < MAX_TENSORINFO_DIMS, "CUDA Tensors cannot have more than 25 dimensions"); + + for (int i = 0; i < dim; ++i) { + sizes[i] = sz[i]; + strides[i] = st[i]; + } +} + +template +void +TensorInfo::reduceDim(int dim) { + TORCH_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1"); + sizes[dim] = 1; +} + +template +int +TensorInfo::collapseDims(const int excludeDim) { + auto result = at::collapse_dims(sizes, strides, dims, excludeDim); + dims = std::get<1>(result); + return std::get<0>(result); +} + +// Translate a linear index for the apply to a T* offset; +// specialized on `Dims` to reduce nvcc compilation time +template +struct IndexToOffset { + static __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + // Uses static dims + for (int i = Dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +// Uses dynamic (runtime) instead of static (compile time) dims +template +struct IndexToOffset { + static inline __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + for (int i = info.dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/UnpackRaw.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/UnpackRaw.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c6f4b3941744ded405f7273f76f4bf182d008c3d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/UnpackRaw.cuh @@ -0,0 +1,39 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// No "#pragma once" because this is a raw definition that can be copied by jit codegen. +// Eager mode clients should not include this file directly, instead, +// they should #include , which has a #pragma once. + +namespace at::cuda::philox { + +// In-kernel call to retrieve philox seed and offset from a PhiloxCudaState instance whether +// that instance was created with graph capture underway or not. +// See Note [CUDA Graph-safe RNG states]. +// +// We can't write a __device__ function in CUDAGeneratorImpl.h, because it's in ATen. +// Also, whatever call unpacks PhiloxCudaState in consumer kernels must be inlineable. +// Easiest thing that comes to mind is, define a __device__ unpack helper here, in ATen/cuda. +// +// The raw definition lives in its own file so jit codegen can easily copy it. +__host__ __device__ __forceinline__ std::tuple +unpack(at::PhiloxCudaState arg) { + if (arg.captured_) { + // static_cast avoids "warning: invalid narrowing conversion from "long" to "unsigned long". + // *(arg.offset_.ptr) is a broadcast load of a single int64_t to the entire kernel. + // For most threads' reads it will hit in cache, so it shouldn't hurt performance. + return std::make_tuple(static_cast(*arg.seed_.ptr), static_cast(*(arg.offset_.ptr) + arg.offset_intragraph_)); + } else { + return std::make_tuple(arg.seed_.val, arg.offset_.val); + } +} + +// Adapted from TE +// extract seed and offset from PhiloxCudaState +__global__ void unpack_cudnn(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr); + +void unpack_cudnn_wrapper(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr, cudaStream_t stream); + +} // namespace at::cuda::philox + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h new file mode 100644 index 0000000000000000000000000000000000000000..797a857504ddcf336f0119f265c7a6d7e2e802a5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h @@ -0,0 +1,705 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif +#include +#include + +namespace at::cuda::tunable { + +using at::blas::ScalingType; + +enum class BlasOp { + N = 0, + T = 1 +}; + +inline char BlasOpToString(BlasOp op) { + switch (op) { + case BlasOp::N: + return 'N'; + case BlasOp::T: + return 'T'; + } + TORCH_CHECK(false, "unrecognized BlasOp"); + return 'N'; +} + +template +inline const char* BLASTypeName(T v) { + return "unknown"; +} + +template <> +inline const char* BLASTypeName(float v) { + return "f32_r"; +} + +template <> +inline const char* BLASTypeName(double v) { + return "f64_r"; +} + +template <> +inline const char* BLASTypeName(BFloat16 v) { + return "bf16_r"; +} + +template <> +inline const char* BLASTypeName(Half v) { + return "f16_r"; +} + +//https://github.com/ROCm/hipBLASLt/blob/develop/library/src/include/auxiliary.hpp#L175 +template <> +inline const char* BLASTypeName(Float8_e4m3fn v) { + return "f8_r"; +} + +template <> +inline const char* BLASTypeName(Float8_e5m2 v) { + return "bf8_r"; +} + +template <> +inline const char* BLASTypeName(Float8_e4m3fnuz v) { + return "f8_fnuz_r"; +} + +template <> +inline const char* BLASTypeName(Float8_e5m2fnuz v) { + return "bf8_fnuz_r"; +} + +template <> +inline const char* BLASTypeName(c10::complex v) { + return "f64_r"; +} + +template <> +inline const char* BLASTypeName(c10::complex v) { + return "f32_r"; +} + +inline std::string ScalarTypeToBLASType(c10::ScalarType scalar_type) { + std::string BLASType; + switch (scalar_type) { + case c10::ScalarType::Float:{ + BLASType = "f32_r"; + break; + } + case c10::ScalarType::Double:{ + BLASType = "f64_r"; + break; + } + case c10::ScalarType::BFloat16:{ + BLASType = "bf16_r"; + break; + } + case c10::ScalarType::Half: { + BLASType = "f16_r"; + break; + } + case c10::ScalarType::Float8_e4m3fn: { + BLASType = "f8_r"; + break; + } + case c10::ScalarType::Float8_e5m2: { + BLASType = "bf8_r"; + break; + } + case c10::ScalarType::Float8_e4m3fnuz: { + BLASType = "f8_fnuz_r"; + break; + } + case c10::ScalarType::Float8_e5m2fnuz: { + BLASType = "bf8_fnuz_r"; + break; + } + case c10::ScalarType::ComplexFloat:{ + BLASType = "f32_c"; + break; + } + case c10::ScalarType::ComplexDouble:{ + BLASType = "f64_c"; + break; + } + default: + BLASType = "unknown"; + } + return BLASType; + +} + +// Similar to Compute Type in GemmRocblas.h +template +inline std::string ComputeTypeFor() { + return "Unknown ComputeType"; +} + +// This is a union of the compute types for +// ROCBLAS and hipBLASLt. +template <> +inline std::string ComputeTypeFor() { + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) != at::Float32Precision::TF32) { + return "f32_r"; + } else { + return "xf32_r"; + } +} + +template <> +inline std::string ComputeTypeFor() { + return "f64_r"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +template <> +inline std::string ComputeTypeFor>() { + return "f32_c"; +} + +template <> +inline std::string ComputeTypeFor>() { + return "f64_c"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +template <> +inline std::string ComputeTypeFor() { + return "f32_r"; +} + +// Convert opmath_type to string +template +inline std::string to_string_opmath(const at::opmath_type& value) { + if constexpr (std::is_same_v, c10::complex> || + std::is_same_v, c10::complex>) { + return fmt::format("({:.4f}, {:.4f})", value.real(), value.imag()); + } else { + return fmt::format("{:.4f}", value); + } +} + +// convert activation epilogue to string +inline std::string to_string_epilogue(const at::cuda::blas::GEMMAndBiasActivationEpilogue& value) { + switch (value) { + case at::cuda::blas::GEMMAndBiasActivationEpilogue::None: + return std::string("None"); + break; + case at::cuda::blas::GEMMAndBiasActivationEpilogue::RELU: + return std::string("RELU"); + break; + case cuda::blas::GEMMAndBiasActivationEpilogue::GELU: + return std::string("GELU"); + break; + default: + return std::string("unknown"); + } +} + +namespace detail { + +static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size, const NumericalCheckConfig& config) { + + if (!config.enabled) { + return true; // skip when disabled + } + + auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA); + at::Tensor ref = at::from_blob(c, {size}, options); + at::Tensor oth = at::from_blob(other_c, {size}, options); + at::Tensor ref_float = ref.to(at::kFloat); + at::Tensor oth_float = oth.to(at::kFloat); + + const bool ok = at::allclose(ref_float, oth_float, config.rtol, config.atol); + if (ok) { + TUNABLE_LOG3("├──verify numerics: PASSED with atol=", config.atol, ", rtol=", config.rtol); + } else { + TUNABLE_LOG3("├──verify numerics: FAILED with atol=", config.atol, ", rtol=", config.rtol); + } + return ok; +} + +} + +// Note on GetSizeA et al. +// Tensors can be dense or arbitrarily strided. We only need our copies to be large enough. +// Our copies must be at least as large as the m n k shapes dictate, but could be larger +// depending on the lda ldb ldc values. Similarly for the batched case. + +template +struct GemmParams : OpParams { + GemmParams() = default; + + std::string BLASSignature() const override { + std::string alpha_str = to_string_opmath(alpha); + std::string beta_str = to_string_opmath(beta); + return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, " + "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: 1, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, bias_type: %s, compute_type: %s }", + m, n, k, lda, ldb, ldc, ldc, alpha_str, beta_str, transa, transb, + BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), ComputeTypeFor(), ComputeTypeFor(), ComputeTypeFor()); + } + + std::string Signature() const override { + return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, lda, ldb, ldc); + } + + size_t GetSizeA() const { + size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m); + size_t size_dense = m * k; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeB() const { + size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k); + size_t size_dense = k * n; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeC() const { + size_t size_stride = ldc * n; + size_t size_dense = m * n; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSize(bool duplicate_inputs) const { + size_t size = GetSizeC(); + if (duplicate_inputs) { + size += GetSizeA(); + size += GetSizeB(); + } + return size; + } + + GemmParams* DeepCopy(bool duplicate_inputs) const { + GemmParams* copy = new GemmParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = GetSizeC(); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + if (duplicate_inputs) { + size_t a_size = GetSizeA(); + size_t b_size = GetSizeB(); + copy->a = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(a_size)); + copy->b = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(b_size)); + copy->duplicate_inputs_ = true; + } + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + if (duplicate_inputs_) { + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(a)); + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(b)); + } + } + + TuningStatus NumericalCheck(GemmParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; + } + + char transa{}; + char transb{}; + int64_t m{}; + int64_t n{}; + int64_t k{}; + at::opmath_type alpha; + const T* a{}; + int64_t lda{}; + const T* b{}; + int64_t ldb{}; + at::opmath_type beta; + T* c{}; + int64_t ldc{}; +private: + bool duplicate_inputs_{false}; +}; + +template +struct GemmAndBiasParams : OpParams { + std::string BLASSignature() const override { + std::string alpha_str = to_string_opmath(alpha); + std::string activation_str = to_string_epilogue(activation); + return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, " + "alpha: %s, transA: %c, transB: %c, batch_count: 1, a_type: %s, b_type: %s, c_type: %s, d_type: %s, activation: %s, bias_type: %s, scale_type: %s, compute_type: %s }", + m, n, k, lda, ldb, ldc, ldc, alpha_str, transa, transb, + BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(T{}), activation_str, BLASTypeName(T{}), ComputeTypeFor(), ComputeTypeFor(), ComputeTypeFor()); + } + + std::string Signature() const override { + return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, lda, ldb, ldc); + } + + size_t GetSizeA() const { + size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m); + size_t size_dense = m * k; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeB() const { + size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k); + size_t size_dense = k * n; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeC() const { + size_t size_stride = ldc * n; + size_t size_dense = m * n; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSize(bool duplicate_inputs) const { + size_t size = GetSizeC(); + if (duplicate_inputs) { + size += GetSizeA(); + size += GetSizeB(); + } + return size; + } + + GemmAndBiasParams* DeepCopy(bool duplicate_inputs) const { + GemmAndBiasParams* copy = new GemmAndBiasParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = GetSizeC(); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + if (duplicate_inputs) { + size_t a_size = GetSizeA(); + size_t b_size = GetSizeB(); + copy->a = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(a_size)); + copy->b = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(b_size)); + copy->duplicate_inputs_ = true; + } + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + if (duplicate_inputs_) { + // NOLINTNEXTLINE(*const-cast) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(a)); + // NOLINTNEXTLINE(*const-cast) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(b)); + } + } + + TuningStatus NumericalCheck(GemmAndBiasParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; + } + + char transa{}; + char transb{}; + int64_t m{}; + int64_t n{}; + int64_t k{}; + at::opmath_type alpha{}; + const T* a{}; + int64_t lda{}; + const T* b{}; + int64_t ldb{}; + T* c{}; + int64_t ldc{}; + const T* bias{}; + at::cuda::blas::GEMMAndBiasActivationEpilogue activation{}; +private: + bool duplicate_inputs_{false}; +}; + +template +struct GemmStridedBatchedParams : OpParams { + std::string BLASSignature() const override { + std::string alpha_str = to_string_opmath(alpha); + std::string beta_str = to_string_opmath(beta); + return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: %ld, stride_b: %ld, stride_c: %ld, stride_d: %ld, " + "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: %ld, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }", + m, n, k, lda, ldb, ldc, ldc, stride_a, stride_b, stride_c, stride_c, alpha_str, beta_str, transa, transb, batch, + BLASTypeName(T{}), BLASTypeName(T{}), BLASTypeName(C_Dtype{}), BLASTypeName(T{}), ComputeTypeFor(), ComputeTypeFor()); + } + + std::string Signature() const override { + return fmt::sprintf("%c%c_%ld_%ld_%ld_B_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, batch, lda, ldb, ldc); + } + + size_t GetSizeA() const { + size_t size_stride = stride_a * batch; + size_t size_dense = m * k * batch; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeB() const { + size_t size_stride = stride_b * batch; + size_t size_dense = k * n * batch; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeC() const { + size_t size_stride = stride_c * batch; + size_t size_dense = m * n * batch; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSize(bool duplicate_inputs) const { + size_t size = GetSizeC(); + if (duplicate_inputs) { + size += GetSizeA(); + size += GetSizeB(); + } + return size; + } + + GemmStridedBatchedParams* DeepCopy(bool duplicate_inputs) const { + GemmStridedBatchedParams* copy = new GemmStridedBatchedParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = GetSizeC(); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + if (duplicate_inputs) { + size_t a_size = GetSizeA(); + size_t b_size = GetSizeB(); + // NOLINTNEXTLINE(*const-cast*) + copy->a = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(a_size)); + // NOLINTNEXTLINE(*const-cast*) + copy->b = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(b_size)); + copy->duplicate_inputs_ = true; + } + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + if (duplicate_inputs_) { + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(a)); + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(b)); + } + } + + TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; + } + + char transa{}; + char transb{}; + int64_t m{}; + int64_t n{}; + int64_t k{}; + at::opmath_type alpha{}; + const T* a{}; + int64_t lda{}; + int64_t stride_a{}; + const T* b{}; + int64_t ldb{}; + int64_t stride_b{}; + at::opmath_type beta; + C_Dtype* c{}; + int64_t ldc{}; + int64_t stride_c{}; + int64_t batch{}; +private: + bool duplicate_inputs_{false}; +}; + +template +struct ScaledGemmParams : OpParams { + ScaledGemmParams() = default; + + std::string BLASSignature() const override { + // Excluding use_fast_accum and use_rowise booleans for now + if (bias_ptr == nullptr) { + return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, " + "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }", + m, n, k, lda, ldb, ldc, ldc, transa, transb, + ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), + ComputeTypeFor(), ComputeTypeFor()); + } + else { + return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, " + "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, bias_type: %s, scale_type: %s, compute_type: %s }", + m, n, k, lda, ldb, ldc, ldc, transa, transb, + ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(bias_dtype), + ComputeTypeFor(), ComputeTypeFor()); + } + } + + std::string Signature() const override { + // In Blas.cpp, code defaults to a bias_dtype of Half even when there is no bias vector. + // Search for this line:: + // params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_; + // + // In TunableOp, we must distinguish in param signature these two cases: with and without a bias vector. + return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld_rw_%d_bias_%s", + transa, transb, m, n, k, lda, ldb, ldc, + a_scaling_type == ScalingType::RowWise && b_scaling_type == ScalingType::RowWise, + bias_ptr == nullptr ? "None" : at::toString(bias_dtype)); + } + + size_t GetSizeA() const { + size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m); + size_t size_dense = m * k; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeB() const { + size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k); + size_t size_dense = k * n; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSizeC() const { + size_t size_stride = ldc * n; + size_t size_dense = m * n; + return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense); + } + + size_t GetSize(bool duplicate_inputs) const { + size_t size = GetSizeC(); + if (duplicate_inputs) { + size += GetSizeA(); + size += GetSizeB(); + } + return size; + } + + ScaledGemmParams* DeepCopy(bool duplicate_inputs) const { + ScaledGemmParams* copy = new ScaledGemmParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = GetSizeC(); + copy->c = c10::cuda::CUDACachingAllocator::raw_alloc(c_size); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + if (duplicate_inputs) { + size_t a_size = GetSizeA(); + size_t b_size = GetSizeB(); + copy->a = c10::cuda::CUDACachingAllocator::raw_alloc(a_size); + copy->b = c10::cuda::CUDACachingAllocator::raw_alloc(b_size); + copy->duplicate_inputs_ = true; + } + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + if (duplicate_inputs_) { + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(a)); + // NOLINTNEXTLINE(*const-cast*) + c10::cuda::CUDACachingAllocator::raw_delete(const_cast(b)); + } + } + + TuningStatus NumericalCheck(ScaledGemmParams *other) { + auto* ctx = getTuningContext(); + auto cfg = ctx->GetNumericalCheckConfig(); + return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL; + } + + char transa{}; + char transb{}; + int64_t m{}; + int64_t n{}; + int64_t k{}; + const void* a{}; + const void* a_scale_ptr{}; + int64_t lda{}; + ScalarType a_dtype{}; + ScalarType a_scale_dtype{}; + ScalingType a_scaling_type{}; + const void* b{}; + const void* b_scale_ptr{}; + int64_t ldb{}; + ScalarType b_dtype{}; + ScalarType b_scale_dtype{}; + ScalingType b_scaling_type{}; + const void* bias_ptr{}; + ScalarType bias_dtype{}; + void* c{}; + const void* c_scale_ptr{}; + int64_t ldc{}; + ScalarType c_dtype{}; + void* amax_ptr{}; + bool use_fast_accum{}; +private: + bool duplicate_inputs_{false}; +}; + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h new file mode 100644 index 0000000000000000000000000000000000000000..13d0bf23bff74af65cd296a413da661df8f9e183 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h @@ -0,0 +1,692 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TORCH_HIPBLASLT_CHECK(EXPR) \ + do { \ + hipblasStatus_t __err = EXPR; \ + TORCH_CHECK(__err == HIPBLAS_STATUS_SUCCESS, \ + "hipblaslt error: ", \ + hipblasStatusToString(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) + +namespace at::cuda::tunable { + +template +constexpr hipDataType HipDataTypeFor(); + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_32F; +} + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_16F; +} + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_16BF; +} + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_64F; +} + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_8F_E4M3_FNUZ; +} + +template <> +constexpr hipDataType HipDataTypeFor() { + return HIP_R_8F_E5M2_FNUZ; +} + +// This code is instantiated regardless of ROCm version. +// Prior to ROCm 6.3, we hard-code the known enum values. +template <> +constexpr hipDataType HipDataTypeFor() { +#if ROCM_VERSION >= 60300 + return HIP_R_8F_E4M3; +#else + return static_cast(28); +#endif +} + +template <> +constexpr hipDataType HipDataTypeFor() { +#if ROCM_VERSION >= 60300 + return HIP_R_8F_E5M2; +#else + return static_cast(29); +#endif +} + +// This type is not intended for matrix types but rather a scale factor. +// Return a dummy value to satisfy linker. +template <> +constexpr hipDataType HipDataTypeFor() { + return static_cast(500); +} + +template <> +constexpr hipDataType HipDataTypeFor() { +#if ROCM_VERSION >= 70000 + return HIP_R_4F_E2M1; +#else + return static_cast(33); +#endif +} + +template +int GetBatchFromParams(const GemmParams* params) { + return 1; +} + +template +int GetBatchFromParams(const GemmAndBiasParams* params) { + return 1; +} + +template +int GetBatchFromParams(const GemmStridedBatchedParams* params) { + return params->batch; +} + +template +int GetBatchFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmAndBiasParams* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmStridedBatchedParams* params) { + return params->stride_a; +} + +template +int GetStrideAFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmAndBiasParams* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmStridedBatchedParams* params) { + return params->stride_b; +} + +template +int GetStrideBFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmAndBiasParams* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmStridedBatchedParams* params) { + return params->stride_c; +} + +template +int GetStrideCFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +float GetAlphaFromParams(const GemmParams* params) { + return params->alpha; +} + +template +float GetAlphaFromParams(const GemmAndBiasParams* params) { + return params->alpha; +} + +template +float GetAlphaFromParams(const GemmStridedBatchedParams* params) { + return params->alpha; +} + +template +float GetAlphaFromParams(const ScaledGemmParams* params) { + return 1.0; +} + +template +float GetBetaFromParams(const GemmParams* params) { + return params->beta; +} + +template +float GetBetaFromParams(const GemmAndBiasParams* params) { + return 0.0; +} + +template +float GetBetaFromParams(const GemmStridedBatchedParams* params) { + return params->beta; +} + +template +float GetBetaFromParams(const ScaledGemmParams* params) { + return 0.0; +} + +template +ScalingType GetAScalingTypeFromParams(const GemmParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetBScalingTypeFromParams(const GemmParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetAScalingTypeFromParams(const GemmAndBiasParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetBScalingTypeFromParams(const GemmAndBiasParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetAScalingTypeFromParams(const GemmStridedBatchedParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetBScalingTypeFromParams(const GemmStridedBatchedParams* params) { + return ScalingType::TensorWise; +} + +template +ScalingType GetAScalingTypeFromParams(const ScaledGemmParams* params) { + return params->a_scaling_type; +} + +template +ScalingType GetBScalingTypeFromParams(const ScaledGemmParams* params) { + return params->b_scaling_type; +} + +template +const void* GetAScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetAScalePointerFromParams(const GemmAndBiasParams* params) { + return nullptr; +} + +template +const void* GetAScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetAScalePointerFromParams(const ScaledGemmParams* params) { + return params->a_scale_ptr; +} + +template +const void* GetBScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetBScalePointerFromParams(const GemmAndBiasParams* params) { + return nullptr; +} + +template +const void* GetBScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetBScalePointerFromParams(const ScaledGemmParams* params) { + return params->b_scale_ptr; +} + +template +const void* GetDScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetDScalePointerFromParams(const GemmAndBiasParams* params) { + return nullptr; +} + +template +const void* GetDScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetDScalePointerFromParams(const ScaledGemmParams* params) { + return params->c_scale_ptr; +} + +template +const void* GetBiasPointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetBiasPointerFromParams(const GemmAndBiasParams* params) { + return params->bias; +} + +template +const void* GetBiasPointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetBiasPointerFromParams(const ScaledGemmParams* params) { + return params->bias_ptr; +} + +template +hipDataType GetBiasTypeFromParams(const GemmParams* params) { + return HIP_R_32F; +} + +template +hipDataType GetBiasTypeFromParams(const GemmAndBiasParams* params) { + return HipDataTypeFor(); +} + +template +hipDataType GetBiasTypeFromParams(const GemmStridedBatchedParams* params) { + return HIP_R_32F; +} + +template +hipDataType GetBiasTypeFromParams(const ScaledGemmParams* params) { + return at::cuda::ScalarTypeToCudaDataType(params->bias_dtype); +} + +template +at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmParams* params) { + return at::cuda::blas::GEMMAndBiasActivationEpilogue::None; +} + +template +at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmAndBiasParams* params) { + return params->activation; +} + +template +at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmStridedBatchedParams* params) { + return at::cuda::blas::GEMMAndBiasActivationEpilogue::None; +} + +template +at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const ScaledGemmParams* params) { + return at::cuda::blas::GEMMAndBiasActivationEpilogue::None; +} + +static hipblasOperation_t _hipblasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return HIPBLAS_OP_N; + case 't': + case 'T': + return HIPBLAS_OP_T; + case 'c': + case 'C': + return HIPBLAS_OP_C; + } + TORCH_CHECK(false, + "_hipblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); +} + +static char _charFromhipblasOp(hipblasOperation_t op) { + switch (op) { + case HIPBLAS_OP_N: + return 'N'; + case HIPBLAS_OP_T: + return 'T'; + case HIPBLAS_OP_C: + return 'C'; + } + TORCH_CHECK(false, + "_charFromhipblasOp input should be HIPBLAS_OP_N/T/C but got `", op, "`"); +} + +static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) { + if (layout == BlasOp::N) { + return HIPBLAS_OP_N; + } + return HIPBLAS_OP_T; +} + +template +struct HipBlasLtDeleter { + void operator()(T* x) { + if (x != nullptr) { + TORCH_CUDABLAS_CHECK(destructor(x)); + } + } +}; + +template +class HipBlasLtDescriptor { + public: + T* descriptor() const { + return descriptor_.get(); + } + T* descriptor() { + return descriptor_.get(); + } + + protected: + std::unique_ptr> descriptor_; +}; + +class HipBlasLtMatmulDescriptor : public HipBlasLtDescriptor< + hipblasLtMatmulDescOpaque_t, + &hipblasLtMatmulDescDestroy> { + public: + HipBlasLtMatmulDescriptor( + hipblasComputeType_t compute_type, + hipDataType scale_type) { + hipblasLtMatmulDesc_t raw_descriptor = nullptr; + TORCH_HIPBLASLT_CHECK( + hipblasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(hipblasLtMatmulDescAttributes_t attr, const T value) { + TORCH_HIPBLASLT_CHECK(::hipblasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T))); + } +}; + +template +class HipblasltGemmOp : public Callable { + public: + HipblasltGemmOp(hipblasLtMatmulAlgo_t algo) : algo_{algo} {} + + TuningStatus Call(const ParamsT* params) override { + hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout); + hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout); + auto a_datatype = HipDataTypeFor(); + auto b_datatype = HipDataTypeFor(); + auto in_out_datatype = HipDataTypeFor(); + auto opa = _hipblasOpFromChar(params->transa); + auto opb = _hipblasOpFromChar(params->transb); + + TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen"); + + float alpha = GetAlphaFromParams(params); + float beta = GetBetaFromParams(params); + + hipblasLtMatrixLayout_t mat_a, mat_b, mat_c; + if (opa == HIPBLAS_OP_N) { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->m, params->k, params->lda)); + } + else { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->k, params->m, params->lda)); + } + if (opb == HIPBLAS_OP_N) { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->k, params->n, params->ldb)); + } + else { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->n, params->k, params->ldb)); + } + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_c, in_out_datatype, params->m, params->n, params->ldc)); + + // specific to batched gemmm + int batch = GetBatchFromParams(params); + if (batch > 1) { + int64_t stride_a = GetStrideAFromParams(params); + int64_t stride_b = GetStrideBFromParams(params); + int64_t stride_c = GetStrideCFromParams(params); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_a, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_a, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_a, sizeof(stride_a))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_b, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_b, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_b, sizeof(stride_b))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_c, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c))); + } + + hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) { + computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; + } + HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb); + + // specific to scaled gemm + const void* mat1_scale_ptr = GetAScalePointerFromParams(params); + const void* mat2_scale_ptr = GetBScalePointerFromParams(params); + const void* result_scale_ptr = GetDScalePointerFromParams(params); + if (mat1_scale_ptr && mat2_scale_ptr) { + hipblasLtMatmulDescAttributes_t a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER; + hipblasLtMatmulDescAttributes_t b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER; + if (GetAScalingTypeFromParams(params) == ScalingType::RowWise) { +#if defined(HIPBLASLT_OUTER_VEC) + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); +#elif defined(HIPBLASLT_VEC_EXT) + a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT; +#endif + } + if (GetBScalingTypeFromParams(params) == ScalingType::RowWise) { +#if defined(HIPBLASLT_OUTER_VEC) + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F); +#elif defined(HIPBLASLT_VEC_EXT) + b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT; +#endif + } + matmul.setAttribute(a_scale_ptr_desc, mat1_scale_ptr); + matmul.setAttribute(b_scale_ptr_desc, mat2_scale_ptr); + } + if (result_scale_ptr) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); + } + + const void* bias_ptr = GetBiasPointerFromParams(params); + auto bias_datatype = GetBiasTypeFromParams(params); + if (bias_ptr) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, bias_datatype); + auto activation = GetActivationFromParams(params); + if (activation == at::cuda::blas::GEMMAndBiasActivationEpilogue::RELU) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_RELU_BIAS); + } + else if (activation == at::cuda::blas::GEMMAndBiasActivationEpilogue::GELU) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_GELU_BIAS); + } + else { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_BIAS); + } + } + + size_t workspace_size = at::cuda::getCUDABlasLtWorkspaceSize(); + + auto op_handle = at::cuda::getCurrentCUDABlasLtHandle(); + + size_t ret_workspace_size = 0; + auto status = hipblaslt_ext::matmulIsAlgoSupported(op_handle, + matmul.descriptor(), + &alpha, + mat_a, + mat_b, + &beta, + mat_c, + mat_c, + algo_, + ret_workspace_size); + + if (status == HIPBLAS_STATUS_SUCCESS) { + if (ret_workspace_size >= workspace_size) { + return FAIL; + } + } + else { + return FAIL; + } + + void* workspace_buffer = at::cuda::getCUDABlasLtWorkspace(); + + TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle, + matmul.descriptor(), + &alpha, + params->a, + mat_a, + params->b, + mat_b, + &beta, + params->c, + mat_c, + params->c, + mat_c, + &algo_, + workspace_buffer, + workspace_size, + at::cuda::getCurrentCUDAStream())); + + //TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescDestroy(matmul)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c)); + return OK; + } + + private: + hipblasLtMatmulAlgo_t algo_; +}; + +template +auto GetHipBlasLtTypeStringAndOps() { + hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout); + hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout); + auto a_datatype = HipDataTypeFor(); + auto b_datatype = HipDataTypeFor(); + auto in_out_datatype = HipDataTypeFor(); + std::vector heuristic_result; +#if ROCM_VERSION == 60400 + // hipblaslt TT fp32 regression on ROCm 6.4, cannot use + if ((a_datatype == HIP_R_32F || b_datatype == HIP_R_32F || in_out_datatype == HIP_R_32F) + && (transa_outer == HIPBLAS_OP_T && transb_outer == HIPBLAS_OP_T)) { + std::vector>>> ignore; + return ignore; + } +#endif + + hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F; + if (at::globalContext().allowTF32CuBLAS()) { + computeType = HIPBLAS_COMPUTE_32F_FAST_TF32; + } + + hipblasLtHandle_t handle; + TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle)); + TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle, + hipblaslt_ext::GemmType::HIPBLASLT_GEMM, + transa_outer, + transb_outer, + a_datatype, + b_datatype, + in_out_datatype, + in_out_datatype, + computeType, + heuristic_result)); + TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle)); + + int returned_algo_count = heuristic_result.size(); + std::vector>>> ret; + for (int i = 0; i < returned_algo_count; i++) { + auto algo = heuristic_result[i].algo; + int algo_index = hipblaslt_ext::getIndexFromAlgo(algo); + auto callable = std::make_unique>(algo); + std::string type_string = fmt::sprintf("Gemm_Hipblaslt_%d", algo_index); + ret.emplace_back(type_string, std::move(callable)); + } + + return ret; +} + +template +auto GetHipBlasLtGemmTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +template +auto GetHipBlasLtGemmAndBiasTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +template +auto GetHipBlasLtGemmStridedBatchedTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +template +auto GetHipBlasLtScaledGemmTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +#undef TORCH_HIPBLASLT_CHECK + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h new file mode 100644 index 0000000000000000000000000000000000000000..8734d42b01a9a8603532f3284b14904471543a2e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h @@ -0,0 +1,282 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include + +#define ROCBLAS_BETA_FEATURES_API +#include + +#define TORCH_ROCBLAS_CHECK(EXPR) \ + do { \ + rocblas_status __err = EXPR; \ + TORCH_CHECK(__err == rocblas_status_success, \ + "rocblas error: ", \ + rocblas_status_to_string(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) + +namespace at::cuda::tunable { + +template +constexpr rocblas_datatype RocBlasDataTypeFor(); + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f64_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f16_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_bf16_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor>() { + return rocblas_datatype_f32_c; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor>() { + return rocblas_datatype_f64_c; +} + +template +constexpr rocblas_datatype RocBlasComputeTypeFor(); + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + return rocblas_datatype_f64_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + // Note that we're returning the _compute_ type for a given datatype. + // As of 12/2022, using compute type FP16 for 16-bit floats was much + // slower than using compute type FP32. So we use FP32 compute even for + // FP16 datatypes. This is how GEMM is implemented even in the function + // rocblasGemmHelper (see fpgeneric.h) + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + // Note that we're returning the _compute_ type for a given datatype. + // As of 12/2022, using compute type FP16 for 16-bit floats was much + // slower than using compute type FP32. So we use FP32 compute even for + // BF16 datatypes. This is how GEMM is implemented even in the function + // rocblasGemmHelper (see fpgeneric.h) + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor>() { + return rocblas_datatype_f32_c; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor>() { + return rocblas_datatype_f64_c; +} + +template +auto DoCastForHalfOrBfloat16(const T fp) { + return fp; +} + +template <> +inline auto DoCastForHalfOrBfloat16(const Half fp) { + // alpha and beta should be the same as compute_type, in Half case it is float. + float h = fp; + return h; +} + +template <> +inline auto DoCastForHalfOrBfloat16(const BFloat16 fp) { + // alpha and beta should be the same as compute_type, in bfloat16 case it is float. + float h = fp; + return h; +} + +static rocblas_operation _rocblasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return rocblas_operation_none; + case 't': + case 'T': + return rocblas_operation_transpose; + case 'c': + case 'C': + return rocblas_operation_conjugate_transpose; + } + TORCH_CHECK(false, + "_rocblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); +} + +template +class RocblasGemmOp : public Callable> { + public: + RocblasGemmOp(int solution) : solution_{solution} {} + + TuningStatus Call(const GemmParams* params) override { + auto input_output_type = RocBlasDataTypeFor(); + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) + return FAIL; // no support for TF32 in rocBLAS + auto compute_type = RocBlasComputeTypeFor(); + auto h_a = DoCastForHalfOrBfloat16(params->alpha); + auto h_b = DoCastForHalfOrBfloat16(params->beta); + auto status = rocblas_gemm_ex( + (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(), + _rocblasOpFromChar(params->transa), + _rocblasOpFromChar(params->transb), + params->m, params->n, params->k, + &h_a, + params->a, input_output_type, params->lda, + params->b, input_output_type, params->ldb, + &h_b, + params->c, input_output_type, params->ldc, + params->c, input_output_type, params->ldc, + compute_type, + rocblas_gemm_algo_solution_index, + solution_, + rocblas_gemm_flags_none); + if (status != rocblas_status_success) { + return FAIL; + } + return OK; + } + + private: + int solution_; +}; + +template +auto GetRocBlasGemmTypeStringAndOps() { + rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(); + int solution_size; + auto input_output_type = RocBlasDataTypeFor(); + auto compute_type = RocBlasComputeTypeFor(); + // Get the number of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + nullptr, + &solution_size)); + std::vector solutions(solution_size); + // Get the list of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + solutions.data(), + &solution_size)); + std::vector>>>> ret; + for (size_t i = 0; i < solutions.size(); ++i) { + auto callable = std::make_unique>(solutions[i]); + ret.emplace_back(std::make_pair(fmt::sprintf("Gemm_Rocblas_%d", solutions[i]), std::move(callable))); + } + return ret; +} + +template +class RocblasGemmStridedBatchedOp : public Callable> { + public: + RocblasGemmStridedBatchedOp(int solution) : solution_{solution} {} + + TuningStatus Call(const GemmStridedBatchedParams* params) override { + auto input_output_type = RocBlasDataTypeFor(); + if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32 && input_output_type == rocblas_datatype_f32_r) + return FAIL; // no support for TF32 in rocBLAS + auto compute_type = RocBlasComputeTypeFor(); + auto h_a = DoCastForHalfOrBfloat16(params->alpha); + auto h_b = DoCastForHalfOrBfloat16(params->beta); + auto status = rocblas_gemm_strided_batched_ex( + (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(), + _rocblasOpFromChar(params->transa), + _rocblasOpFromChar(params->transb), + params->m, params->n, params->k, + &h_a, + params->a, input_output_type, params->lda, params->stride_a, + params->b, input_output_type, params->ldb, params->stride_b, + &h_b, + params->c, input_output_type, params->ldc, params->stride_c, + params->c, input_output_type, params->ldc, params->stride_c, + params->batch, + compute_type, + rocblas_gemm_algo_solution_index, + solution_, + rocblas_gemm_flags_none); + if (status != rocblas_status_success) { + return FAIL; + } + return OK; + } + + private: + int solution_; +}; + +template +auto GetRocBlasGemmStridedBatchedTypeStringAndOps() { + rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(); + int solution_size; + auto input_output_type = RocBlasDataTypeFor(); + auto compute_type = RocBlasComputeTypeFor(); + // Get the number of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + nullptr, + &solution_size)); + std::vector solutions(solution_size); + // Get the list of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + solutions.data(), + &solution_size)); + // Sort the solutions in ascending order to make the solution vector deterministic across runs + std::sort(solutions.begin(), solutions.end()); + + std::vector>>>> ret; + for (size_t i = 0; i < solutions.size(); ++i) { + auto callable = std::make_unique>(solutions[i]); + ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable))); + } + return ret; +} + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h new file mode 100644 index 0000000000000000000000000000000000000000..14f1f089ad4fc04b28c6c1c1d36fe64056725fd9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h @@ -0,0 +1,55 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include + +#include + +namespace at::cuda::tunable { + +class StreamTimer : public ITimer { + public: + StreamTimer(); + ~StreamTimer() override; + + void Start() override; + + void End() override; + + float Duration() override; + + private: + cudaEvent_t start_{}; + cudaEvent_t end_{}; +}; + +class StreamTimerNoSync : public ITimer { + public: + StreamTimerNoSync(); + ~StreamTimerNoSync() override; + + void Start() override; + + void End() override; + + float Duration() override; + + private: + cudaEvent_t start_{}; + cudaEvent_t end_{}; +}; + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h new file mode 100644 index 0000000000000000000000000000000000000000..c055f6e72989c3c6e66a35671d10839f3bb354c8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h @@ -0,0 +1,270 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TUNABLE_LOGV(LEVEL, ...) getTuningContext()->Log(LEVEL, __VA_ARGS__) +#define TUNABLE_LOG1(...) TUNABLE_LOGV(1, __VA_ARGS__) +#define TUNABLE_LOG2(...) TUNABLE_LOGV(2, __VA_ARGS__) +#define TUNABLE_LOG3(...) TUNABLE_LOGV(3, __VA_ARGS__) + +namespace at::cuda::tunable { + +enum TORCH_CUDA_CPP_API TuningStatus { + OK = 0, + FAIL = 1, + UNSUPPORTED = 2, +}; + +// Mapping from params signature to kernel id +class TORCH_CUDA_CPP_API ResultEntry { + public: + explicit ResultEntry(std::string key, double time) : key_(std::move(key)), time_(time) {} + explicit ResultEntry(std::string key, double time, std::string blas_sig ) : key_(std::move(key)), time_(time), blas_sig_(std::move(blas_sig)) {} + bool operator==(const ResultEntry& other) const { return key_ == other.key_; } + bool operator!=(const ResultEntry& other) const { return key_ != other.key_; } + operator std::string () { return key_; } + std::string GetKey() const { return key_; } + double GetTime() const { return time_; } + friend std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry); + static ResultEntry Null() { return ResultEntry("Null", 0.0); } + static ResultEntry Default() { return ResultEntry("Default", 0.0); } + + private: + std::string key_; + double time_; + std::string blas_sig_; +}; + +typedef std::unordered_map KernelMap; +typedef std::unordered_map ResultsMap; +typedef std::unordered_map> UntunedMap; + +struct TORCH_CUDA_CPP_API TuningResults { + // Validates if these results are compatible with the libraries + std::unordered_map validators; + + // Mapping from Callable signature to Callable's tuning result + ResultsMap results; +}; + +class TORCH_CUDA_CPP_API TuningResultsManager { + public: + TuningResultsManager() = default; + ~TuningResultsManager() = default; + + KernelMap Lookup(const std::string& op_signature); + + ResultEntry Lookup(const std::string& op_signature, const std::string& params_signature); + + void AddImpl(const std::string& op_signature, + const std::string& params_signature, + ResultEntry best, + KernelMap& kernel_map); + + void Add(const std::string& op_signature, + const std::string& params_signature, + ResultEntry best); + + void Delete(const std::string& op_signature, const std::string& params_signature); + + void DisjointMergeImpl( + const std::string& op_signature, + const KernelMap& kernel_map, + /*out*/ ResultsMap& results); + + void Load(const ResultsMap& results_to_load); + + ResultsMap Dump(); + + void DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map); + + size_t GetSize(); + + void RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, + const std::string& params_signature, const std::string& blas_signature); + + void InitRealtimeAppend( + const std::string& filename, + const std::unordered_map& validators); + + void AppendResultLine(const std::string& op_sig, + const std::string& param_sig, + const ResultEntry& result); + + void CloseRealtimeAppend(); // For clean shutdown + private: + std::mutex lock_; + std::mutex realtime_file_mutex_; + std::unique_ptr realtime_out_; + std::string realtime_filename_; + ResultsMap results_; + UntunedMap untuned_results_; + bool validators_written_ = false; + +}; + +class TORCH_CUDA_CPP_API TuningResultsValidator { + public: + using GetFunc = std::function; + using ValidateFunc = std::function; + using GetValidateFuncs = std::unordered_map>; + + TuningResultsValidator(); + ~TuningResultsValidator() = default; + + std::unordered_map GetAllValidators() const; + TuningStatus ValidateAll(const std::unordered_map& to_validate) const; + void RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf); + + protected: + static std::string GetPyTorchVersion() ; + TuningStatus ValidatePyTorchVersion(const std::string& value) const; + + public: + static constexpr const std::array mandatory_keys{"PT_VERSION"}; + + private: + GetValidateFuncs validators_; +}; + +struct NumericalCheckConfig { + bool enabled{false}; + double atol{1e-5}; + double rtol{1e-5}; + + NumericalCheckConfig() = default; + NumericalCheckConfig(bool e, double a, double r) : enabled(e), atol(a), rtol(r) {} +}; + + +class TORCH_CUDA_CPP_API TuningContext { + public: + TuningContext(); + ~TuningContext(); + TuningContext(TuningContext &) = delete; + TuningContext(TuningContext &&) = delete; + TuningContext &operator=(TuningContext &) = delete; + TuningContext &operator=(TuningContext &&) = delete; + + void EnableTunableOp(bool value); + bool IsTunableOpEnabled() const; + + void EnableTuning(bool value); + bool IsTuningEnabled() const; + + void EnableRecordUntuned(bool value); + bool IsRecordUntunedEnabled() const; + std::ofstream& GetUntunedFile(); + + void EnableNumericsCheck(bool value); + bool IsNumericsCheckEnabled() const; + void SetNumericalCheckConfig(bool enabled, double atol, double rtol); + NumericalCheckConfig GetNumericalCheckConfig() const; + + void SetMaxTuningDurationMs(int max_duration_ms); + int GetMaxTuningDurationMs() const; + + void SetMaxTuningIterations(int max_iter); + int GetMaxTuningIterations() const; + + void SetMaxWarmupDurationMs(int max_duration_ms); + int GetMaxWarmupDurationMs() const; + + void SetMaxWarmupIterations(int max_iter); + int GetMaxWarmupIterations() const; + + void EnableICacheFlush(bool value); + bool IsICacheFlushEnabled() const; + + void SetRotatingBufferSize(int size); + int GetRotatingBufferSize() const; + + TuningResultsManager& GetTuningResultsManager(); + + TuningResultsValidator& GetTuningResultsValidator(); + + TuningResults GetTuningResults(); + + TuningStatus LoadTuningResults(const TuningResults& tr); + + void SetFilename(const std::string& filename, bool insert_device_ordinal=false); + std::string GetFilename() const; + + bool ReadFile(const std::string& filename={}); + + template + void Log(int level, Types... args) { + if (GetLogOkay() && GetLogLevel() >= level) { + GetLog() << c10::str(args...) << std::endl; + } + } + + private: + std::string GetLogFilename() const; + int GetLogLevel() const; + bool GetLogOkay() const; + std::ostream& GetLog() const; + + bool enable_; + bool tuning_enable_; + bool record_untuned_enable_; + bool manager_initialized_; + bool numerics_check_enable_; + int max_tuning_duration_ms_; + int max_tuning_iterations_; + int max_warmup_duration_ms_; + int max_warmup_iterations_; + bool icache_flush_; + int rotating_buffer_size_; + mutable TuningResultsManager manager_; + mutable c10::once_flag manager_init_once_; + TuningResultsValidator validator_; + std::string filename_; + std::ofstream untuned_file_; + size_t results_count_from_input_file_; + bool is_shutting_down_; + + NumericalCheckConfig numerics_cfg_{}; +}; + +TORCH_CUDA_CPP_API TuningContext* getTuningContext(); + +class ITimer { + public: + ITimer() = default; + virtual ~ITimer() = default; + + virtual void Start() = 0; + virtual void End() = 0; + + /// Computes the elapsed time in milliseconds between Start() and End() + virtual float Duration() = 0; +}; + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h new file mode 100644 index 0000000000000000000000000000000000000000..b377374967ee2f224983c993145eb427d2cc57bd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h @@ -0,0 +1,334 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include +#ifdef USE_ROCM +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::cuda::tunable { + +template +class DefaultGemmOp : public Callable> { + public: + TuningStatus Call(const GemmParams* params) override { + at::cuda::blas::gemm_internal( + params->transa, params->transb, + params->m, params->n, params->k, + params->alpha, + params->a, params->lda, + params->b, params->ldb, + params->beta, + params->c, params->ldc); + return OK; + } +}; + +static bool _transposeBoolFromChar(char op) { + return op == 't' || op == 'T'; +} + +template +class DefaultGemmAndBiasOp : public Callable> { + public: + TuningStatus Call(const GemmAndBiasParams* params) override { + at::cuda::blas::gemm_and_bias( + _transposeBoolFromChar(params->transa), + _transposeBoolFromChar(params->transb), + params->m, params->n, params->k, + params->alpha, + params->a, params->lda, + params->b, params->ldb, + params->bias, + params->c, params->ldc, + params->activation); + return OK; + } +}; + +template +class DefaultGemmStridedBatchedOp : public Callable> { + public: + TuningStatus Call(const GemmStridedBatchedParams* params) override { + at::cuda::blas::bgemm_internal( + params->transa, params->transb, + params->m, params->n, params->k, + params->alpha, + params->a, params->lda, params->stride_a, + params->b, params->ldb, params->stride_b, + params->beta, + params->c, params->ldc, params->stride_c, + params->batch); + return OK; + } +}; + +template +class DefaultScaledGemmOp : public Callable> { + public: + TuningStatus Call(const ScaledGemmParams* params) override { + at::cuda::blas::scaled_gemm( + params->transa, + params->transb, + params->m, + params->n, + params->k, + params->a, + params->a_scale_ptr, + params->lda, + params->a_dtype, + params->a_scale_dtype, + params->a_scaling_type, + params->b, + params->b_scale_ptr, + params->ldb, + params->b_dtype, + params->b_scale_dtype, + params->b_scaling_type, + params->bias_ptr, + params->bias_dtype, + params->c, + params->c_scale_ptr, + params->ldc, + params->c_dtype, + params->use_fast_accum, + std::nullopt /* alpha */); + return OK; + } +}; + +template +inline bool IsZero(T v) { + return v == 0.0f; +} + +template <> +inline bool IsZero(BFloat16 v) { + return v.x == 0; +} + +template <> +inline bool IsZero(Half v) { + return float(v) == 0.0f; +} + +template <> +inline bool IsZero(c10::complex v) { + return v == 0.0; +} + +template <> +inline bool IsZero(c10::complex v) { + return v == 0.0f; +} + +template +inline const char* TypeName(T v) { + return "unknown"; +} + +template <> +inline const char* TypeName(float v) { + if (at::globalContext().allowTF32CuBLAS()) { + return "tf32"; + } else { + return "float"; + } +} + +template <> +inline const char* TypeName(double v) { + return "double"; +} + +template <> +inline const char* TypeName(BFloat16 v) { + return "BFloat16"; +} + +template <> +inline const char* TypeName(Half v) { + return "Half"; +} + +template <> +inline const char* TypeName(Float8_e4m3fn v) { + return "Float8_e4m3fn"; +} + +template <> +inline const char* TypeName(Float8_e5m2 v) { + return "Float8_e5m2"; +} + +template <> +inline const char* TypeName(Float8_e4m3fnuz v) { + return "Float8_e4m3fnuz"; +} + +template <> +inline const char* TypeName(Float8_e5m2fnuz v) { + return "Float8_e5m2fnuz"; +} + +template <> +inline const char* TypeName(Float8_e8m0fnu v) { + return "Float8_e8m0fnu"; +} + +template <> +inline const char* TypeName(c10::complex v) { + return "c10::complex"; +} + +template <> +inline const char* TypeName(c10::complex v) { + return "c10::complex"; +} + +template +class GemmTunableOp : public TunableOp> { + public: + GemmTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + +#ifdef USE_ROCM + static const auto env_rocblas = c10::utils::check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED"); + if (!env_rocblas.has_value() || env_rocblas.value()) { + for (auto&& [name, op] : GetRocBlasGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + + static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (!env_hipblaslt.has_value() || env_hipblaslt.value()) { + // disallow tuning of hipblaslt with c10::complex + if constexpr ( + !std::is_same_v> && + !std::is_same_v>) { + for (auto&& [name, op] : GetHipBlasLtGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + } +#endif + + this->RegisterOp(std::string("Default"), std::make_unique>()); + } + + std::string Signature() override { + return fmt::sprintf("GemmTunableOp_%s_%c%c", TypeName(T{}), BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +template +class GemmAndBiasTunableOp : public TunableOp> { + public: + GemmAndBiasTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + +#ifdef USE_ROCM + static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (!env_hipblaslt.has_value() || env_hipblaslt.value()) { + // disallow tuning of hipblaslt with c10::complex + if constexpr ( + !std::is_same_v> && + !std::is_same_v>) { + for (auto&& [name, op] : GetHipBlasLtGemmAndBiasTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + } +#endif + + this->RegisterOp(std::string("Default"), std::make_unique>()); + } + + std::string Signature() override { + return fmt::sprintf("GemmAndBiasTunableOp_%s_%c%c", TypeName(T{}), BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +template +class GemmStridedBatchedTunableOp : public TunableOp> { + public: + GemmStridedBatchedTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + +#ifdef USE_ROCM + static const auto env_rocblas = c10::utils::check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED"); + if (!env_rocblas.has_value() || env_rocblas.value()) { + for (auto&& [name, op] : GetRocBlasGemmStridedBatchedTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + + static const auto env_hipblaslt = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (!env_hipblaslt.has_value() || env_hipblaslt.value()) { + // disallow tuning of hipblaslt with c10::complex + if constexpr ( + !std::is_same_v> && + !std::is_same_v>) { + for (auto&& [name, op] : GetHipBlasLtGemmStridedBatchedTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + } +#endif + + this->RegisterOp(std::string("Default"), std::make_unique>()); + } + + std::string Signature() override { + return fmt::sprintf("GemmStridedBatchedTunableOp_%s_%c%c", TypeName(T{}), BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +template +class ScaledGemmTunableOp : public TunableOp> { + public: + ScaledGemmTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + +#ifdef USE_ROCM + for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } +#endif + + this->RegisterOp(std::string("Default"), std::make_unique>()); + } + + std::string Signature() override { + return fmt::sprintf("ScaledGemmTunableOp_%s_%s_%s_%c%c", + TypeName(AT{}), + TypeName(BT{}), + TypeName(CT{}), + BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h new file mode 100644 index 0000000000000000000000000000000000000000..1a59c1aebc7f01340384c0bbc3cbdc1c3299a6dc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h @@ -0,0 +1,434 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include +#include +#include +#include + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include + +namespace at::cuda::tunable { + +template +class Callable { + public: + virtual ~Callable() = default; + virtual TuningStatus Call(const ParamsT* /*unused*/) { + return FAIL; + } + virtual TuningStatus IsSupported(const ParamsT* params) { + return Call(params); + } +}; + +namespace { + +/** http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance */ + +class Stats { + public: + Stats() { + _n = 0UL; + _mean = 0.0; + _M2 = 0.0; + _sum = 0.0; + _min = 0.0; + _max = 0.0; + } + + void sample_value(const double x) { + double delta = 0; + _sum = _sum + x; + if (0UL == _n) { + _min = x; + _max = x; + } + else { + _min = _min < x ? _min : x; + _max = _max > x ? _max : x; + } + _n = _n + 1UL; + delta = x - _mean; + _mean = _mean + delta/_n; + _M2 = _M2 + delta * (x - _mean); + } + + double variance() const { + return _M2/(_n-1); + } + + double stddev() const { + return std::sqrt(variance()); + } + + unsigned long _n; + double _mean; + double _M2; + double _sum; + double _min; + double _max; +}; + +class FixedSizeStack { + private: + std::deque stack; + const size_t max_size; + + public: + FixedSizeStack(size_t size) : max_size(size) {} + + void push(const std::string& value) { + if (stack.size() >= max_size) { + stack.pop_front(); // Remove the oldest entry + } + stack.push_back(value); // Add new entry + } + + auto rbegin() { return stack.rbegin(); } + auto rend() { return stack.rend(); } +}; + +} // anonymous namespace + +template +class TunableOp { + public: + virtual ~TunableOp() = default; + + TuningStatus operator()(const ParamsT* params) { + ResultEntry result = ResultEntry::Null(); + TuningContext* ctx = getTuningContext(); + if (ctx->IsTunableOpEnabled()) { + auto& mgr = ctx->GetTuningResultsManager(); + auto op_sig = Signature(); + auto params_sig = params->Signature(); + auto blas_sig = params->BLASSignature(); + result = mgr.Lookup(op_sig, params_sig); + // If there is not previous tuning result been found, we do the tuning iff tuning is enabled + if (result == ResultEntry::Null()) { + if (ctx->IsTuningEnabled()) { + result = FindFastest(params); + mgr.Add(op_sig, params_sig, result); + } + else if (ctx->IsRecordUntunedEnabled()) { + // or record the gemm into file + mgr.RecordUntuned(ctx->GetUntunedFile(), op_sig, params_sig, blas_sig); + } + } + } + else { + result = ResultEntry::Default(); + } + if (result == ResultEntry::Null()) { + TUNABLE_LOG2("no result, using default"); + result = ResultEntry::Default(); + } + auto iter = ops_.find(result); + TORCH_CHECK(iter != ops_.end()); + return iter->second->Call(params); + } + + virtual std::string Signature() { + // According to C++17 standard https://wg21.link/n4659 section 15.7.4 + // > if the operand of typeid refers to the + // > object under construction or destruction, typeid yields the std::type_info object representing the constructor + // > or destructor’s class. + // So delay the op signature generation. + c10::call_once(signature_init_once_, [this]() { signature_ = CreateSignature(); }); + return signature_; + } + + protected: + void RegisterOp(const std::string& name, std::unique_ptr> op) { + this->op_names_.emplace_back(name); + this->ops_.emplace(name, std::move(op)); + } + + private: + static void WarmUp(Callable *op, const std::vector ¶m, size_t num_iter, size_t &offset) { + TuningContext* ctx = getTuningContext(); + bool do_flush = ctx->IsICacheFlushEnabled(); + for (size_t i = 0; i < num_iter; i++) { + if (do_flush) { + at::cuda::flush_icache(); + } + TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK); + } + } + + static double ProfileSimple(Callable *op, const std::vector ¶m, size_t num_iter, size_t &offset) { + TuningContext* ctx = getTuningContext(); + bool do_flush = ctx->IsICacheFlushEnabled(); + StreamTimerNoSync timer{}; + + // Small Mandatory Warmup + // Reduces outliers + for (size_t i = 0; i < 2; i++) { + TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK); + } + + timer.Start(); + for (size_t i = 0; i < num_iter; i++) { + if (do_flush) { + at::cuda::flush_icache(); + } + TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK); + } + timer.End(); + return timer.Duration() / num_iter; + } + + static Stats ProfileStats(Callable *op, const std::vector ¶m, size_t num_iter, size_t &offset) { + TuningContext* ctx = getTuningContext(); + bool do_flush = ctx->IsICacheFlushEnabled(); + std::vector timer(num_iter); + + // Small Mandatory Warmup + // Reduces outliers + for (size_t i = 0; i < 2; i++) { + TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK); + } + + for (size_t i = 0; i < num_iter; i++) { + timer[i].Start(); + TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK); + timer[i].End(); + if (do_flush) { + at::cuda::flush_icache(); + } + } + Stats s; + for (size_t i = 0; i < num_iter; i++) { + s.sample_value(timer[i].Duration()); + } + return s; + } + + protected: + virtual ResultEntry FindFastest(const ParamsT* params) { + TuningContext* ctx = getTuningContext(); + auto op_sig = Signature(); + auto params_sig = params->Signature(); + auto blas_sig = params->BLASSignature(); + TUNABLE_LOG2("finding fastest for ", op_sig, '(', params_sig, ')', " out of ", op_names_.size(), " candidates"); + auto min_duration_ms = std::numeric_limits::infinity(); + std::string id_name = "Default"; + ParamsT* reference_params = nullptr; + auto top_solns = FixedSizeStack(5); + + // numeric check option is controlled by non-static env var, so check it once per tuned operator + bool do_numerics_check = ctx->IsNumericsCheckEnabled(); + + // calculate a reference answer for numerical check + if (do_numerics_check) { + reference_params = params->DeepCopy(false); + TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK); + } + + // need copies of params to reuse + // make as many copies as will fill the requested rotating buffer size, if requested + // rotating_size guaranteed to be >= 0 even though GetRotatingBufferSize() returns int + size_t rotating_size = ctx->GetRotatingBufferSize(); + bool use_buffer_rotation = (rotating_size > 0); + size_t param_size = params->GetSize(use_buffer_rotation); + size_t param_count = (rotating_size / param_size) + 1; + constexpr size_t MB = 1024ull*1024; + if (use_buffer_rotation) { + TUNABLE_LOG2("Rotating buffer ", rotating_size/MB, " MiB. ", + "Needed Size: ", param_size/MB, " MiB. ", + "Needed number of param copies: ", param_count); + } + TORCH_CHECK(param_count > 0); + + std::vector reusable_params(param_count); + for (size_t i = 0; i < param_count; i++) { + reusable_params[i] = params->DeepCopy(use_buffer_rotation); + } + + // for rotating buffer + size_t offset = 0; + + for (size_t i = 0; i < op_names_.size(); i++) { + auto* candidate = ops_[op_names_[i]].get(); // borrow pointer + + auto status = candidate->Call(reusable_params[0]); + if (status != OK) { + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + + // collect a small profile + int approx_num_iter = 3; + auto s = ProfileStats(candidate, reusable_params, approx_num_iter, offset); + double approx_duration = s._mean; + // bail if too slow + if (approx_duration > 1.5 * min_duration_ms) { + TUNABLE_LOG3("├──skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + + // 2nd phase skip, more aggressive + approx_num_iter = 10; + s = ProfileStats(candidate, reusable_params, approx_num_iter, offset); + approx_duration = s._mean; + // bail if too slow + if (approx_duration > 1.15 * min_duration_ms) { + TUNABLE_LOG3("├──2nd skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + + if (do_numerics_check) { + ParamsT* numerical_params = params->DeepCopy(false); + auto status = candidate->Call(numerical_params); + if (status != OK) { + numerical_params->Delete(); + TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + status = reference_params->NumericalCheck(numerical_params); + numerical_params->Delete(); + if (status != OK) { + TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + } + + // for warmup does user set max duration, max iters, or both? + // warmup is skipped by default, i.e. warmup_iter = 0 + // warmup will be set to the non-zero value of max_warmup_duration + // or max_warmup_iter + // if both are non-zero, we take the smaller of the two. + double max_warmup_duration = ctx->GetMaxWarmupDurationMs(); + int max_warmup_iter = ctx->GetMaxWarmupIterations(); + int warmup_iter = 0; // default + if (max_warmup_duration > 0) { + int duration_iters = max_warmup_duration / approx_duration; + if (max_warmup_iter > 0) { + warmup_iter = std::min(max_warmup_iter, duration_iters); + } + else { + warmup_iter = duration_iters; + } + } + else if (max_warmup_iter > 0) { + warmup_iter = max_warmup_iter; + } + + // for tuning does user set max duration, max iters, or both? + double max_tuning_duration = ctx->GetMaxTuningDurationMs(); + int max_tuning_iter = ctx->GetMaxTuningIterations(); + int tuning_iter = 100; // default + if (max_tuning_duration > 0) { + int duration_iters = max_tuning_duration / approx_duration; + if (max_tuning_iter > 0) { + tuning_iter = std::min(max_tuning_iter, duration_iters); + } + else { + tuning_iter = duration_iters; + } + } + else if (max_tuning_iter > 0) { + tuning_iter = max_tuning_iter; + } + // tuning must run at least 1 iteration + tuning_iter = std::max(1, tuning_iter); + + // do the full warmup followed by tuning + double warmup_ms = warmup_iter * approx_duration; + double tuning_ms = tuning_iter * approx_duration; + TUNABLE_LOG3("├──tuning using " + "warmup iters ", warmup_iter, " [", warmup_ms, " ms] " + "and tuning iters ", tuning_iter, " [", tuning_ms, " ms] ", + "instance id=", i, ", ", op_sig, "(", params_sig, ") ", op_names_[i]); + TUNABLE_LOG3("├──offset at ", offset); + WarmUp(candidate, reusable_params, warmup_iter, offset); + s = ProfileStats(candidate, reusable_params, tuning_iter, offset); + auto s_stddev = s.stddev(); + // Assume normal distribution. + // Solution with smallest mean + 2*sigma will be a better solution? + // if ((s._mean + 2*s_stddev) < (min_duration_ms + 2*min_stddev_ms)) { + if (s._mean < min_duration_ms) { + TUNABLE_LOG3("├──found better instance id=", i, ". " , s._mean, "ms. ", op_names_[i], + " min ", s._min, + " max ", s._max, + " mean ", s._mean, + " std ", s_stddev); + min_duration_ms = s._mean; + id_name = op_names_[i]; + std::string current_soln = std::to_string(s._mean) + " " + op_names_[i]; + top_solns.push(current_soln); + } + else { + TUNABLE_LOG3("├──found slower instance id=", i, ". " , s._mean, "ms. ", op_names_[i], + " min ", s._min, + " max ", s._max, + " mean ", s._mean, + " std ", s_stddev); + } + } + + for (size_t i = 0; i < reusable_params.size(); i++) { + reusable_params[i]->Delete(); + } + if (reference_params) { + reference_params->Delete(); + } + + TUNABLE_LOG2("└──found fastest for ", op_sig, '(', params_sig, ") ", id_name); + TUNABLE_LOG2("└──top five solutions for ", op_sig, '(', params_sig, ") "); + for (auto it = top_solns.rbegin(); it != top_solns.rend(); ++it) { + TUNABLE_LOG2(" ", *it); + } + return ResultEntry(id_name, min_duration_ms, blas_sig); + } + + private: + std::string CreateSignature() { +#ifndef _WIN32 + const auto* name = typeid(*this).name(); + // NOLINTNEXTLINE(*array*) + char buf[256]; + size_t buf_len = 256; + abi::__cxa_demangle(name, buf, &buf_len, nullptr); + buf[255] = '\0'; + return buf; +#else + return typeid(*this).name(); +#endif + } + + mutable c10::once_flag signature_init_once_; + std::string signature_; + + std::unordered_map>> ops_; + std::vector op_names_; +}; + +struct OpParams { + virtual ~OpParams() = default; + virtual std::string Signature() const = 0; + virtual std::string BLASSignature() const = 0; +}; + +} // namespace at::cuda::tunable + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/ADInterpreters.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/ADInterpreters.h new file mode 100644 index 0000000000000000000000000000000000000000..6412a44107cffbc1c6793dd7f14a1c711875a358 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/ADInterpreters.h @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::functorch { + +// These are the interpreters for our AD transforms +// (grad, vjp and jvp). +// See NOTE: [functorch interpreter stack] for more details. + +struct TORCH_API GradInterpreterPtr { + explicit GradInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Grad); } + TransformType key() const { return base_->key(); } + int64_t level() const { return base_->level(); } + void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack); + void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case); + bool prevGradMode() const { + return std::get(base_->meta()).prevGradMode_; + } + Tensor lift(const Tensor& tensor) const; + private: + const Interpreter* base_; +}; + +struct TORCH_API JvpInterpreterPtr { + explicit JvpInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Jvp); } + TransformType key() const { return base_->key(); } + int64_t level() const { return base_->level(); } + void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack); + void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case); + bool prevFwdGradMode() const { + return std::get(base_->meta()).prevFwdGradMode_; + } + Tensor lift(const Tensor& tensor) const; + private: + const Interpreter* base_; +}; + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchRulesHelper.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchRulesHelper.h new file mode 100644 index 0000000000000000000000000000000000000000..9fd81d3273ce57027528b65071917fa80f4729a0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchRulesHelper.h @@ -0,0 +1,486 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +#pragma once + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// This file contains helper functions for batching rules. + +namespace at::functorch { + +TORCH_API Tensor reshape_dim_into(int64_t src, int64_t dst, const Tensor& x); +TORCH_API Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x); + +TORCH_API Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, const Tensor& x); + +Tensor moveBatchDimToFront(Tensor tensor, std::optional maybe_batch_dim); +int64_t rankWithoutBatchDim(const Tensor& tensor, std::optional maybe_batch_dim); +int64_t numelWithoutBatchDim(const Tensor& tensor, std::optional maybe_batch_dim); +std::optional valIfNonempty(std::optional maybe_empty, int64_t new_val); +int64_t getPhysicalDim(const Tensor& tensor, bool has_batch_dim, int64_t logical_dim); +VmapDimVector getPhysicalDims(const Tensor& tensor, bool has_batch_dim, IntArrayRef logical_dims); + +void vmapIncompatibleInplaceError(const char* schema_name); + +Tensor maybePadToLogicalRank(const Tensor& tensor, std::optional has_bdim, int64_t logical_rank); + +void check_randomness(RandomnessType randomness); +void check_randomness(RandomnessType randomness, bool any_tensor_bdim); + +inline Tensor ensure_has_bdim(const Tensor& tensor, bool has_bdim, c10::SymInt batch_size) { + if (has_bdim) { + return tensor; + } + const auto sizes = tensor.sym_sizes(); + SymDimVector expanded_shape; + expanded_shape.reserve(sizes.size()); + expanded_shape.emplace_back(std::move(batch_size)); + expanded_shape.insert(expanded_shape.end(), sizes.begin(), sizes.end()); + return tensor.expand_symint(expanded_shape); +} + +#define VMAP_SUPPORT(op, batch_rule) \ + m.impl(#op, op ## _generated_plumbing); + +#define VMAP_SUPPORT2(op, overload, batch_rule) \ + m.impl(#op "." #overload, op ## _ ## overload ## _generated_plumbing); + +#define OP_DECOMPOSE(op) m.impl(#op, static_cast(native::op)); +#define OP_DECOMPOSE2(op, overload) m.impl(#op"."#overload, static_cast(native::op)); + +// DO NOT USE ME DIRECTLY! Use BASIC_UNARY_BATCH_RULE to save yourself some pain +template +struct BasicUnaryBatchRuleHelper; + +template +struct BasicUnaryBatchRuleHelper> { + static std::tuple> apply( + const Tensor& tensor, + std::optional batch_dim, + T... extra_args) { + return std::make_tuple(Func(tensor, std::forward(extra_args)...), batch_dim); + } +}; + +// USAGE: BASIC_UNARY_BATCH_RULE(at::sin) +// INCORRECT USAGE: BASIC_UNARY_BATCH_RULE(&at::sin) +// It is important that this macro is not passed a function pointer!! +#define BASIC_UNARY_BATCH_RULE(fn) SINGLE_ARG(\ + BasicUnaryBatchRuleHelper<\ + decltype(&fn),\ + &fn,\ + c10::guts::function_traits::parameter_types>::apply) + +#define UNARY_POINTWISE(op) \ + VMAP_SUPPORT(op, BASIC_UNARY_BATCH_RULE(ATEN_FN(op))); + +template +struct VariadicBdimsBatchRuleHelper; + +template +struct VariadicBdimsBatchRuleHelper> { + static std::tuple> apply( + const Tensor& tensor, + std::optional batch_dim, + T... extra_args) { + auto tensor_ = moveBatchDimToFront(tensor, batch_dim); + return std::make_tuple(Func(tensor_, std::forward(extra_args)...), 0); + } +}; + +// USAGE: VARIADIC_BDIMS_BATCH_RULE(at::cholesky_inverse) +// INCORRECT USAGE: VARIADIC_BDIMS_BATCH_RULE(&at::cholesky_inverse) +// It is important that this macro is not passed a function pointer!! +#define VARIADIC_BDIMS_BATCH_RULE(fn) SINGLE_ARG(\ + VariadicBdimsBatchRuleHelper<\ + decltype(&fn),\ + &fn,\ + c10::guts::function_traits::parameter_types>::apply) + +#define VARIADIC_BDIMS(op) \ + VMAP_SUPPORT(op, VARIADIC_BDIMS_BATCH_RULE(ATEN_FN(op))); + +#define VARIADIC_BDIMS2(op, overload) \ + VMAP_SUPPORT2(op, overload, VARIADIC_BDIMS_BATCH_RULE(ATEN_FN2(op, overload))); + +template +void boxed_tensor_inputs_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack* stack) { + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + + c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); + auto maybe_layer = maybeCurrentDynamicLayer(); + vmap_check_escaped(maybe_layer, "boxed_tensor_inputs_batch_rule"); + + int64_t cur_level = maybe_layer->layerId(); + + auto orig_arguments = torch::jit::last(*stack, num_arguments); + if (std::none_of(orig_arguments.begin(), orig_arguments.end(), ivalueParticipatesInCurrentLevel)) { + op.callBoxed(stack); + return; + } + + auto arguments = torch::jit::pop(*stack, num_arguments); + std::vector>> tensor_inputs; + std::vector tensor_pos; + for (const auto idx : c10::irange(0, num_arguments)) { + const auto& ivalue = arguments[idx]; + if (ivalue.isTensor()) { + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(ivalue.toTensor(), cur_level); + tensor_inputs.emplace_back(std::move(tensor_value), tensor_bdim); + tensor_pos.push_back(static_cast(idx)); + } + } + Func(tensor_inputs); + + size_t tensor_idx = 0; + TORCH_INTERNAL_ASSERT(!tensor_pos.empty()); + for (const auto arg_idx : c10::irange(0, num_arguments)) { + if (tensor_idx >= tensor_pos.size() || (int64_t)arg_idx != tensor_pos[tensor_idx]) { + torch::jit::push(stack, arguments[arg_idx]); + } else { + TORCH_INTERNAL_ASSERT(tensor_idx < tensor_inputs.size()); + torch::jit::push(stack, tensor_inputs[tensor_idx].first); + tensor_idx++; + } + } + + op.callBoxed(stack); + const auto returns = torch::jit::pop(*stack, num_returns); + for (const auto& ret : returns) { + if (ret.isTensor()) { + torch::jit::push(stack, makeBatched(ret.toTensor(), 0, cur_level)); + } else { + TORCH_INTERNAL_ASSERT(false, "This boxed batching rule does not currently support ops that return non-tensor values"); + } + } +} + +inline void handle_pointwise_ops(std::vector>> &tensor_inputs) { + int64_t out_logical_rank = 0; + for (auto& tensor_input : tensor_inputs) { + int64_t cur_logical_rank = rankWithoutBatchDim(tensor_input.first, tensor_input.second); + out_logical_rank = std::max(out_logical_rank, cur_logical_rank); + } + for (auto& tensor_input: tensor_inputs) { + tensor_input.first = moveBatchDimToFront(tensor_input.first, tensor_input.second); + tensor_input.first = maybePadToLogicalRank(tensor_input.first, tensor_input.second, out_logical_rank); + } +} + +#define POINTWISE_BOXED(op) \ + m.impl(#op, torch::CppFunction::makeFromBoxedFunction>()); + +#define POINTWISE_BOXED2(op, overload) \ + m.impl(#op "." #overload, torch::CppFunction::makeFromBoxedFunction>()); + +inline void handle_variadic_bdims(std::vector>> &tensor_inputs) { + for (auto & tensor_input : tensor_inputs) { + tensor_input.first = moveBatchDimToFront(tensor_input.first, tensor_input.second); + } +} + +#define VARIADIC_BDIMS_BOXED(op) \ + m.impl(#op, torch::CppFunction::makeFromBoxedFunction>()); + +using UnpackedBatchedTensor = std::tuple>; + +inline void find_and_unpack_tensors( + const torch::jit::Stack* stack, + int64_t num_args, + int64_t cur_level, + SmallVector* tensors, + SmallVector* tensors_pos, + int64_t* batch_size) { + + int64_t computed_batch_size = -1; + int64_t args_begin = static_cast(stack->size()) - num_args; + + for (const auto idx : c10::irange(0, num_args)) { + const auto& ivalue = (*stack)[args_begin + idx]; + if (!ivalue.isTensor()) { + continue; + } + auto unpacked = unwrapTensorAtLevel(ivalue.toTensor(), cur_level); + const auto& [tensor_value, tensor_bdim] = unpacked; + if (tensor_bdim.has_value()) { + auto candidate_batch_size = tensor_value.size(*tensor_bdim); + if (computed_batch_size == -1) { + computed_batch_size = candidate_batch_size; + } + TORCH_INTERNAL_ASSERT(candidate_batch_size == computed_batch_size); + } + + tensors->push_back(std::move(unpacked)); + tensors_pos->push_back(idx); + } + TORCH_INTERNAL_ASSERT(computed_batch_size > -1); + *batch_size = computed_batch_size; +} + +inline void boxed_existing_bdim_all_batch_rule( + const c10::OperatorHandle& op, torch::jit::Stack* stack) { + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = static_cast(schema.arguments().size()); + + c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); + const auto maybe_layer = maybeCurrentDynamicLayer(); + vmap_check_escaped(maybe_layer, "boxed_existing_bdim_all_batch_rule"); + + const auto arguments = torch::jit::last(stack, num_arguments); + if (std::none_of(arguments.begin(), arguments.end(), ivalueParticipatesInCurrentLevel)) { + op.callBoxed(stack); + return; + } + + int64_t args_begin = static_cast(stack->size()) - num_arguments; + SmallVector tensor_inputs; + SmallVector tensor_pos; + int64_t batch_size = 0; + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + int64_t cur_level = maybe_layer->layerId(); + + find_and_unpack_tensors( + stack, num_arguments, cur_level, + &tensor_inputs, &tensor_pos, &batch_size); + + // for each tensor, ensure it has a bdim and reshape it. + for (const auto tensor_idx : c10::irange(0, tensor_inputs.size())) { + const auto& [value, bdim] = tensor_inputs[tensor_idx]; + auto value_ = ensure_has_bdim(value, bdim.has_value(), batch_size); + (*stack)[args_begin + tensor_pos[tensor_idx]] = reshape_dim_into(bdim.value_or(0), 0, value_); + } + + op.callBoxed(stack); + + for (const auto idx : c10::irange(args_begin, args_begin + num_returns)) { + const auto& ret = (*stack)[idx]; + TORCH_INTERNAL_ASSERT(ret.isTensor(), + "This boxed batching rule does not currently support ops that return non-tensor values"); + (*stack)[idx] = makeBatched(reshape_dim_outof(0, batch_size, ret.toTensor()), 0, cur_level); + } +} + +// Use when all tensors arguments accept one (normal) batch dim. +// This batching rule expands the batch dim on all Tensors, reshapes it into +// dim 0, calls the op, and then reshapes the batch dim out of dim 0. +// This is not the most efficient thing; if there are alternatives, please try +// to use them. Use this only as a last resort. +#define EXISTING_BDIM_ALL_BOXED(op) \ + m.impl(#op, torch::CppFunction::makeFromBoxedFunction()); + +template +inline void boxed_all_tensors_have_optional_bdim( + const c10::OperatorHandle& op, torch::jit::Stack* stack) { + const auto& schema = op.schema(); + const auto num_returns = schema.returns().size(); + const auto num_arguments = schema.arguments().size(); + + c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); + auto maybe_layer = maybeCurrentDynamicLayer(); + vmap_check_escaped(maybe_layer, "boxed_all_tensors_have_optional_bdim"); + int64_t cur_level = maybe_layer->layerId(); + + const auto arguments = torch::jit::last(stack, num_arguments); + if (std::none_of(arguments.begin(), arguments.end(), ivalueParticipatesInCurrentLevel)) { + op.callBoxed(stack); + return; + } + + int64_t args_begin = static_cast(stack->size() - num_arguments); + SmallVector tensor_inputs; + SmallVector tensor_pos; + int64_t batch_size = 0; + + find_and_unpack_tensors( + stack, static_cast(num_arguments), cur_level, + &tensor_inputs, &tensor_pos, &batch_size); + + std::optional is_no_batch_dim_case; + + for (const auto tensor_idx : c10::irange(0, tensor_inputs.size())) { + const auto& value = std::get<0>(tensor_inputs[tensor_idx]); + auto bdim = std::get<1>(tensor_inputs[tensor_idx]); + const auto logical_rank = rankWithoutBatchDim(value, bdim); + + if (!is_no_batch_dim_case.has_value()) { + is_no_batch_dim_case = (logical_rank == feature_rank); + } + auto value_ = ensure_has_bdim(value, bdim.has_value(), batch_size); + if (!bdim.has_value()) { + bdim = 0; + } + if (*is_no_batch_dim_case) { + TORCH_INTERNAL_ASSERT(logical_rank == feature_rank); + value_ = moveBatchDimToFront(value_, bdim); + if (tensor_idx == contig_tensor_index) { + value_ = value_.contiguous(); + } + (*stack)[args_begin + tensor_pos[tensor_idx]] = std::move(value_); + continue; + } + TORCH_INTERNAL_ASSERT(logical_rank == feature_rank + 1); + value_ = reshape_dim_into(*bdim, 0, value_); + if (tensor_idx == contig_tensor_index) { + value_ = value_.contiguous(); + } + (*stack)[args_begin + tensor_pos[tensor_idx]] = std::move(value_); + } + + op.callBoxed(stack); + + for (const auto idx : c10::irange(args_begin, args_begin + num_returns)) { + const auto& ret = (*stack)[idx]; + TORCH_INTERNAL_ASSERT(ret.isTensor(), + "This boxed batching rule does not currently support ops that return non-tensor values"); + if (*is_no_batch_dim_case) { + (*stack)[idx] = makeBatched(ret.toTensor(), 0, cur_level); + } else { + (*stack)[idx] = makeBatched(reshape_dim_outof(0, batch_size, ret.toTensor()), 0, cur_level); + } + } +} + +// Useful for many NN operators. +// The operator must satisfy the following: +// - All arguments must accept an optional batch dim. +// - All arguments must be the same rank +#define ALL_TENSORS_HAVE_OPTIONAL_BDIM_BOXED(feature_rank, op) \ + m.impl(#op, torch::CppFunction::makeFromBoxedFunction>()); + +#define ALL_TENSORS_HAVE_OPTIONAL_BDIM_BOXED_CONTIG1(feature_rank, op, contig_tensor_index) \ + m.impl(#op, \ + torch::CppFunction::makeFromBoxedFunction<\ + boxed_all_tensors_have_optional_bdim<\ + feature_rank, \ + contig_tensor_index>\ + >()); + +template +struct ExistingBdimBatchRuleHelper; + +template +struct ExistingBdimBatchRuleHelper> { + static std::tuple> apply( + const Tensor& self, + std::optional self_bdim, + T... extra_args) { + auto self_ = reshape_dim_into(*self_bdim, 0, self); + auto out = Func(self_, std::forward(extra_args)...); + return std::make_tuple(reshape_dim_outof_symint(0, self.sym_sizes()[*self_bdim], out), 0); + } +}; + +// USAGE: EXISTING_BDIM_BATCH_RULE(at::cholesky_inverse) +// INCORRECT USAGE: EXISTING_BDIM_BATCH_RULE(&at::cholesky_inverse) +// It is important that this macro is not passed a function pointer!! +#define EXISTING_BDIM_BATCH_RULE(fn) SINGLE_ARG(\ + ExistingBdimBatchRuleHelper<\ + decltype(&fn),\ + &fn,\ + c10::guts::function_traits::parameter_types>::apply) + + +#define EXISTING_BDIM(op) \ + VMAP_SUPPORT(op, EXISTING_BDIM_BATCH_RULE(ATEN_FN(op))); + +#define EXISTING_BDIM2(op, overload) \ + VMAP_SUPPORT2(op, overload, EXISTING_BDIM_BATCH_RULE(ATEN_FN2(op, overload))); + +#define INVOKE(object,ptrToMember) ((object).*(ptrToMember)) + + +template +Tensor& unary_inplace_batch_rule(Tensor& self, std::optional /*unused*/, ExtraArgs... extra_args) { + INVOKE(self, Method)(std::forward(extra_args)...); + return self; +} + +inline int64_t get_bdim_size4( + const Tensor& a_value, std::optional a_bdim, + const Tensor& b_value, std::optional b_bdim, + const Tensor& c_value, std::optional c_bdim, + const Tensor& d_value, std::optional d_bdim) { + if (a_bdim) + return a_value.size(*a_bdim); + if (b_bdim) + return b_value.size(*b_bdim); + if (c_bdim) + return c_value.size(*c_bdim); + if (d_bdim) + return d_value.size(*d_bdim); + TORCH_INTERNAL_ASSERT(false); +} + +inline int64_t get_bdim_size3( + const Tensor& a_value, std::optional a_bdim, + const Tensor& b_value, std::optional b_bdim, + const Tensor& c_value, std::optional c_bdim) { + if (a_bdim) + return a_value.size(*a_bdim); + if (b_bdim) + return b_value.size(*b_bdim); + if (c_bdim) + return c_value.size(*c_bdim); + TORCH_INTERNAL_ASSERT(false); +} + +inline int64_t get_bdim_size2( + const Tensor& a_value, std::optional a_bdim, + const Tensor& b_value, std::optional b_bdim) { + if (a_bdim) + return a_value.size(*a_bdim); + if (b_bdim) + return b_value.size(*b_bdim); + TORCH_INTERNAL_ASSERT(false); +} + +inline c10::SymInt get_bdim_size2_symint( + const Tensor& a_value, std::optional a_bdim, + const Tensor& b_value, std::optional b_bdim) { + if (a_bdim) + return a_value.sym_size(*a_bdim); + if (b_bdim) + return b_value.sym_size(*b_bdim); + TORCH_INTERNAL_ASSERT(false); +} + +// [start, start + 1, ..., stop - 1] +inline VmapDimVector range(int64_t start, int64_t stop) { + TORCH_INTERNAL_ASSERT(stop >= start); + VmapDimVector dims; + dims.reserve(stop - start); + for (int64_t i = start; i < stop; i++) { + dims.emplace_back(i); + } + return dims; +} +std::tuple _binary_pointwise_helper( + const Tensor& tensor, std::optional tensor_batch_dim, const Tensor& other, std::optional other_batch_dim, + bool do_type_promotion=true); + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedFallback.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedFallback.h new file mode 100644 index 0000000000000000000000000000000000000000..befe73ec33e2006a69c5ebe5ce741d5f04d2114f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedFallback.h @@ -0,0 +1,86 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include +#include +#include + +namespace at::functorch { + +// This file contains code for the vmap fallback (also known as the +// BatchedTensor fallback or the Batched fallback). This code runs +// when an operation doesn't have a batching rule implemented. + +// If an operator doesn't have a batching rule implemented then we fallback +// to this implementation. The fallback doesn't work on out= variants or +// view operations; that is, it works for out-of-place operations and +// in-place non-view operations. +// +// For out-of-place operations, the fallback effectively takes all of the +// BatchedTensors in `stack`, slices them, and runs `op` on all of the +// corresponding slices to produce slices of the outputs. The output slices +// then get `torch.stack`ed to create the +// final returns. +// +// The performance of the fallback is not very good because it introduces an +// extra copy from stacking the sliced outputs. Because of this, we prefer to +// write batching rules for operators whenever possible. +void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack); +void batchedNestedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack); + +void vmapErrorFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack); + +// The vmap fallback emits a warning by default, but it may be disabled if +// the user finds it to be too annoying. +TORCH_API bool isVmapFallbackWarningEnabled(); +TORCH_API void setVmapFallbackWarningEnabled(bool enabled); + +// Used for testing. The vmap fallback is enabled by default. When it is disabled, +// it raises an error. +TORCH_API bool isVmapFallbackEnabled(); +TORCH_API void setVmapFallbackEnabled(bool enabled); + +template A vector_to_result(const std::vector& buffer) { + return buffer[0].to(); +} +template std::tuple vector_to_result(const std::vector& buffer) { + return std::make_tuple(buffer[0].to(), buffer[1].to()); +} +template std::tuple vector_to_result(const std::vector& buffer) { + return std::make_tuple(buffer[0].to(), buffer[1].to(), buffer[2].to()); +} + +// slow_fallback is a way to call the vmap fallback inside some boxed kernel. +// There is probably some better way to metaprogram this. +template +Ret slow_fallback(const c10::OperatorHandle& op, ArrayRef args) { + std::vector stack(args.begin(), args.end()); + batchedTensorForLoopFallback(op, &stack); + return vector_to_result(stack); +} + +template +std::tuple slow_fallback(const c10::OperatorHandle& op, ArrayRef args) { + std::vector stack(args.begin(), args.end()); + batchedTensorForLoopFallback(op, &stack); + return vector_to_result(stack); +} + +template +std::tuple slow_fallback(const c10::OperatorHandle& op, ArrayRef args) { + std::vector stack(args.begin(), args.end()); + batchedTensorForLoopFallback(op, &stack); + return vector_to_result(stack); +} + + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..1424c85bd62b78ec0a8bc3c4bb190db50a4dc200 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h @@ -0,0 +1,181 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include + +#include +#include +#include + +namespace at::functorch { + +using Tensor = at::Tensor; + +// We assume this in a few other places in the codebase, +// but there isn't a centralized definition. +constexpr int64_t kVmapMaxTensorDims = 64; + +// The valid vmap levels range from [0, 64). This effectively means that we +// support a maximum of 64 nested vmaps. +constexpr int64_t kVmapNumLevels = 64; + +// Store this number of elements of BatchDims on the stack. Most people will +// probably use <= 5 nested vmaps, but adjust this number as necessary. +constexpr int64_t kBatchDimsStackSize = 5; + +// A BatchedTensorImpl holds an underlying Tensor and a single batch dim +// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a +// BatchedTensorImpl. +// +// The batch dimensions are treated as being "private"; they are not user-visible. +// For example, in the following Tensor, +// bt = BatchedTensorImpl(ones(2, 3, 5, 7), lvl=1, dim=0) +// dimension 0 is batch dimension. +// +// bt.sizes() returns (5, 7); bt.sum(0) performs a reduction over the (public) +// dim 0, which is equivalent to dim 3 in the underlying ones(2, 3, 5, 7) tensor. +struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { + explicit BatchedTensorImpl(at::DispatchKeySet key_set, Tensor value, int64_t dim, int64_t level); + + // Returns batch dimension of this tensor + int64_t bdim() const { return bdim_; } + + // Returns batch dimension of this tensor + int64_t level() const { return level_; } + + // BatchedTensorImpl wraps a Tensor + const Tensor& value() const { return value_; } + + // Given a public dimension index, return the dimension index in the underlying + // value() tensor. + // For example, if we have + // bt = BatchedTensorImpl(ones(2, 3, 5, 7), lvl=1, dim=0) + // bt.actualDim(0) -> 1 + // bt.actualDim(1) -> 2 + // bt.actualDim(2) -> 3 + // bt.actualDim(3) -> Error + int64_t actualDim(int64_t dim, bool wrap_dim = true) const; + + IntArrayRef sizes_custom() const override; + SymIntArrayRef sym_sizes_custom() const override; + int64_t size_custom(int64_t d) const override; + c10::SymInt sym_size_custom(int64_t d) const override; + // We have to override this because we opted into CustomStrides + IntArrayRef strides_custom() const override; + SymIntArrayRef sym_strides_custom() const override; + // Override a bunch of methods inherited from TensorImpl to return error messages. + c10::SymBool sym_is_contiguous_custom(at::MemoryFormat memory_format) const override; + void set_size(int64_t dim, int64_t new_size) override; + void set_stride(int64_t dim, int64_t new_stride) override; + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override; + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override; + void shallow_copy_from(const c10::intrusive_ptr& impl) override; +#ifdef DEBUG + bool has_storage() const override; +#endif + + void refreshTensorMetadata(); + + // Used in torchdim. torchdim uses non-lexical BatchedTensor; the way it + // accomplishes this is a hack where it is able to modify the levels of + // BatchedTensor to match the level of the current vmap transform. + void _unsafe_set_level(int64_t level) { + level_ = level; + } + + // Used in batching rule for in-place view operations that can change + // the index of the bdim (think squeeze_, unsqueeze_) + void unsafe_set_bdim(int64_t bdim) { + // NB: you MUST call refreshTensorMetadata after doing this. + bdim_ = bdim; + } + private: + // see NOTE: [BatchedTensorImpl levels invariant] + void checkInvariants() const; + const char* tensorimpl_type_name() const override; + + Tensor value_; + + int64_t level_; + int64_t bdim_; +}; + +// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a +// BatchedTensorImpl. +inline bool isBatchedTensor(const Tensor& tensor) { + return tensor.unsafeGetTensorImpl()->key_set().has(DispatchKey::FuncTorchBatched) || + tensor.unsafeGetTensorImpl()->key_set().has(DispatchKey::BatchedNestedTensor); +} + +// It is unsafe to call this on a Tensor that is not backed by a +// BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible. +inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) { + return static_cast(tensor.unsafeGetTensorImpl()); +} + +inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) { + if (!isBatchedTensor(tensor)) { + return nullptr; + } + return unsafeGetBatchedImpl(tensor); +} + +// Returns a bitset. If bit i is set, then that means dim i is a batchdim. +inline std::bitset createBatchDimBitset(int64_t dim) { + std::bitset is_bdim; + is_bdim.set(dim); + return is_bdim; +} + +// Creates a bitset for the given level +inline std::bitset createVmapLevelsBitset(int64_t level) { + std::bitset result; + result.set(level); + return result; +} + +// Use this to construct a BatchedTensor from a regular Tensor +TORCH_API Tensor makeBatched(Tensor tensor, int64_t dim, int64_t level); + +// Adds a batch dim to `tensor`, returning a BatchedTensor +TORCH_API Tensor addBatchDim(Tensor tensor, int64_t dim, int64_t level); + +// Certain dispatch keys must be propagated to the BatchedTensor (or, in general, +// any wrapper Tensor subclasses). This is because there are methods on Tensor +// that skip dispatch and check for the presence of a dispatch key (e.g. is_cpu()). +// TODO: should probably contain more (or all?) backend keys +constexpr DispatchKeySet kKeysToPropagateToWrapper({ + DispatchKey::Negative, + DispatchKey::Conjugate, + DispatchKey::XLA, + DispatchKey::XPU, + DispatchKey::HPU, + DispatchKey::CUDA, + DispatchKey::CPU, + DispatchKey::PrivateUse1, + DispatchKey::SparseCPU, + DispatchKey::SparseCUDA, + DispatchKey::SparseCsrCPU, + DispatchKey::SparseCsrCUDA, +}); + +inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) { + auto key_set = tensor.unsafeGetTensorImpl()->key_set(); + return key_set & kKeysToPropagateToWrapper; +} + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchingMetaprogramming.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchingMetaprogramming.h new file mode 100644 index 0000000000000000000000000000000000000000..7351ea6fb52283786dae5dd24efe6dcdd4bea698 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchingMetaprogramming.h @@ -0,0 +1,131 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include +#include + +// This file contains template metaprogramming things that are used for our +// batching rules. +// +// See NOTE: [vmap plumbing] for more details on why this is necessary. +// The plumbing has a bunch of metaprogramming hacks for determining the signature +// of a batching rule from the signature of the operator, many of which use the +// helper functions in this file. + +namespace at::functorch { + +// Metaprogramming things +template using typelist = c10::guts::typelist::typelist; +template using head_t = c10::guts::typelist::head_t; +template using concat_t = c10::guts::typelist::concat_t; +template class debug_t; + +// tail operation +template +struct tail final { + static_assert(c10::guts::false_t::value, + "In typelist::tail, the T argument must be typelist<...>."); +}; +template +struct tail> final { + using type = typelist; +}; +template using tail_t = typename tail::type; + +template +struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext { + using type = Next; +}; +template +struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, Next, Tail> { + using type = Tail; +}; +template +struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, Next, Tail> { + using type = Tail; +}; +template +struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, Next, Tail> { + using type = Tail; +}; +template +struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, std::optional, Next, Tail> { + using type = Tail; +}; +template +struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext&, std::optional, Next, Tail> { + using type = Tail; +}; +template +struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext&, std::optional, Next, Tail> { + using type = Tail; +}; +template +struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext, std::optional, Next, Tail> { + using type = Tail; +}; +template struct RemoveBatchDimAfterTensor { + using first = head_t; + using next = tail_t; + using second = head_t; + using tail = tail_t; + + using type = concat_t< + typelist, + typename RemoveBatchDimAfterTensor< + typename IfFirstIsTensorAndSecondisBatchDimThenTailElseNext::type + >::type + >; +}; +template struct RemoveBatchDimAfterTensor> { + using type = typelist; +}; +template <> struct RemoveBatchDimAfterTensor> { + using type = typelist<>; +}; +template using remove_batch_dim_after_tensor_t = typename RemoveBatchDimAfterTensor::type; + +template struct UnpackSingleItemTuple { + using type = T; +}; +template struct UnpackSingleItemTuple> { + using type = T; +}; +template using unpack_single_item_tuple_t = typename UnpackSingleItemTuple::type; + +template struct BuildFunctionHelper; +template struct BuildFunctionHelper> { + using type = Return(Args...); +}; +template +struct BuildFunction { + using type = typename BuildFunctionHelper>::type; +}; +template using build_function_t = typename BuildFunction::type; + + +template struct ToOperatorType { + using batch_rule_return_type = typename c10::guts::function_traits::return_type; + using batch_rule_parameter_types = typename c10::guts::function_traits::parameter_types; + + using operator_parameter_types = remove_batch_dim_after_tensor_t; + using operator_return_type = + unpack_single_item_tuple_t< + c10::guts::typelist::to_tuple_t< + remove_batch_dim_after_tensor_t< + c10::guts::typelist::from_tuple_t>>>; + + using type = build_function_t; +}; +template using to_operator_t = typename ToOperatorType::type; + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/DynamicLayer.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/DynamicLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..56425b673e7f325dc9ca5749b218e77e825e0de1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/DynamicLayer.h @@ -0,0 +1,129 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Forward declared +namespace c10 { struct AutogradMetaInterface; } + +namespace at::functorch { + +// This file contains the implementation of functorch's interpreter stack. +// See NOTE: [functorch interpreter stack] first before reading on. +// +// NB: the functorch interpreter stack is also referred to as: +// - the "dynamic layer stack" -- an older name for "interpreter" was +// "dynamic layer". +// - the "functorch mode stack". You can think of each functorch transform as a +// "mode" (in the same sense as torch_dispatch mode or torch_function mode), +// and functorch being an implementation of a "mode stack" where the modes +// may be arbitrary composed. + +// DynamicLayer is basically the same thing as an Interpreter. +// It represents a functorch transform and it holds an Interpreter, +// which contains metadata related to the transform and instructions on +// how to perform the transform. +// +// TODO: we can excise DynamicLayer in favor of Interpreter, +// But I am going to leave it for now as a compatibility shim to avoid +// needing to refactor a lot of callsites... +struct TORCH_API DynamicLayer { + explicit DynamicLayer( + TransformType transform_type, + int64_t layerId, + std::optional batchSize = std::nullopt, + std::optional randomness = std::nullopt, + std::optional prev_grad_mode = std::nullopt, + std::optional pre_fwd_grad_mode = std::nullopt, + std::optional functionalize_add_back_views = std::nullopt); + + TransformType key() const; + int64_t layerId() const; + + const Interpreter& interpreter() const { return interpreter_; } + Interpreter& interpreter() { return interpreter_; } + + // Only valid for vmap + c10::SymInt batchSize() const; + RandomnessType randomness() const; + + private: + Interpreter interpreter_; +}; + +TORCH_API int64_t initAndPushDynamicLayer( + TransformType transform_type, + std::optional batch_size = std::nullopt, + std::optional randomness = std::nullopt, + std::optional prev_grad_mode = std::nullopt, + std::optional prev_fwd_grad_mode = std::nullopt, + std::optional functionalize_add_back_views = std::nullopt); +TORCH_API DynamicLayer popDynamicLayerAndDeleteMetadata(); +TORCH_API std::optional maybeCurrentDynamicLayer(); +TORCH_API const std::vector& getDynamicLayerStack(); +TORCH_API void setDynamicLayerStack(const std::vector& stack); +TORCH_API void setDynamicLayerFrontBackKeysIncluded(bool included); + +// NOTE: [Life handles and lexically scoped transforms] +// functorch transforms are lexically scoped. +// Given a level, we store a "life handle" that is a boolean that tells us if the +// transform with that level is active or not. +// +// functorch's TensorWrapper (for grad transforms) stores a life handle. +// If a TensorWrapper escapes from the scope of the transform, then somehow +// it must know it escaped; it can tell by querying the life handle. +TORCH_API const std::shared_ptr& getLifeHandleForLevel(int64_t level); + +// Returns if an operator is in-place. An operator is inplace if: +// 1. The first argument is a Tensor and it is being written to +// 2. The first argument is being returned +// 3. No other arguments are aliased +// Here is an example of an in-place operator: +// add_(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) +TORCH_API bool isInplaceOp(const c10::FunctionSchema& schema); + +// Given the indices of unwrapped inputs and the schema, this returns the indices of any outputs that should remain unwrapped +TORCH_API std::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input); + +TORCH_API Tensor unwrapIfDead(const Tensor& tensor); +TORCH_API bool isDeadTensorWrapper(const Tensor& tensor); + +// Pretty printers +TORCH_API std::ostream& operator<<(std::ostream& os, const DynamicLayer& layer); +TORCH_API std::ostream& operator<<(std::ostream& os, const std::vector& dynamicLayerStack); + +// While a functorch transform is active, torch.autograd.function._SingleLevelFunction +// is disabled by default. The following two APIs are APIs for enabling +// it. These are not user-facing APIs. We can delete this in the future, but +// it is useful for debugging when something goes wrong with the +// autograd.Function <> functorch interaction, which uses _SingleLevelFunction, +// because it leads to loud errors if something is incorrect. +TORCH_API void setSingleLevelAutogradFunctionAllowed(bool allowed); +TORCH_API bool getSingleLevelAutogradFunctionAllowed(); + +// While a functorch grad transform is active, Tensor.requires_grad_() gets +// disabled. These two functions are the mechanism to controlling that. +TORCH_API void setInplaceRequiresGradAllowed(bool allowed); +TORCH_API bool getInplaceRequiresGradAllowed(); + +TORCH_API DynamicLayer popDynamicLayer(); +TORCH_API int64_t pushDynamicLayer(DynamicLayer&& layer); + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h new file mode 100644 index 0000000000000000000000000000000000000000..7b11100e7075cdb5b0d45dd9c6c5844608c504fe --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::functorch { + +// This is the interpreter that handles the functionalize() transform. +// See NOTE: [functorch interpreter stack] for more details. + +struct FunctionalizeInterpreterPtr { + explicit FunctionalizeInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Functionalize); } + TransformType key() const { return base_->key(); } + int64_t level() const { return base_->level(); } + void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack); + void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case); + bool functionalizeAddBackViews() const { + return std::get(base_->meta()).functionalizeAddBackViews_; + } + private: + const Interpreter* base_; +}; + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/Interpreter.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/Interpreter.h new file mode 100644 index 0000000000000000000000000000000000000000..25c8e6f0dac29208608e4e2ec6582f3163a3cbe8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/Interpreter.h @@ -0,0 +1,358 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at::functorch { + +// NOTE: [functorch interpreter stack] +// +// functorch's dispatching system uses a stack of interpreters. +// Historically we've referred to this as the "DynamicLayerStack". +// +// An interpreter is something that reads in the code it is passed +// and then executes it. We have a different interpreter per-transform: +// the "VmapInterpreter" is responsible for reading in operators (like aten::mv) +// and executing the batched version of it (the batching rule for aten::mv). +// +// Concretely, each interpreter is responsible for two things: +// +// 1) process(ophandle, stack) +// Given an operator handle and a stack of arguments, the interpreter is +// responsible for figuring out how to execute the operation under the semantics +// of the interpreter. For e.g. VmapInterpreter, this is figuring out how to call +// the batching rule. +// +// The batching rules are stored as kernels on the FuncTorchBatched key, so the way +// VmapInterpreter calls the batching rule is roughly: (A) exclude all +// dispatch keys aside from the Batched key, (B) redispatch so we get to the +// Batched key. +// +// 2) sendToNextInterpreter(ophandle, stack) +// The VmapInterpreter, when it sees aten::mv, will process it into a call to +// aten::mm. It then needs to send the call to aten::mm to the next interpreter +// in the interpreter stack. +// +// The VmapInterpreter just does this via a call to ophandle.callBoxed(stack) +// and most Interpreters will implement it this way. + +enum class RandomnessType { + Error, // always errors when calling a random function + Same, // randomness appears the same across batches + Different, // randomness appears different across batches + END +}; + +enum class TransformType { + Torch, // Unused + Vmap, + Grad, // reverse-mode AD, aka vjp + Jvp, // forward-mode AD + Functionalize, +}; + +std::ostream& operator<<(std::ostream& os, const TransformType& t); + +// NOTE: [Interpreter "subclassing" design] +// +// How are various Interpreters for different transforms (vmap, grad, ...) +// implemented? +// +// Accessing interpreters is in the hot-path of functorch so we have a constraint +// that this code must be as fast as possible. +// +// As a result, we stay away from virtual methods and this causes our code +// to look a little funny. +// +// `Interpreter` is the struct for Interpreters. It holds ALL of the +// relevant information (what type of interpreter it is and the metadata). +// Metadata for each interpreter is represented as a Union (std::variant) +// of all possible metadata (VmapInterpreterMeta, GradInterpreterMeta, ...). +// +// Given an Interpreter, how do I get a "VmapInterpreter"? You may wish to do this +// if you want to access the metadata fields (like batchSize and randomness). +// +// Each type of interpreter (e.g. Vmap) has a convenience struct +// (e.g. VmapInterpreterPtr) associated with it. +// +// Construct the convenience struct with VmapInterpreterPtr(Interpreter*), +// and then one can access methods on VmapInterpreterPtr like so: +// >>> VmapInterpreterPtr(&interpreter).batchSize() +// +// Finally, Interpreter::process switches on the type of the interpreter +// and calls one of {Transform}Interpreter::processImpl under the hood. +// Same for Interpreter::sendToNextInterpreter :) + +struct VmapInterpreterMeta { + explicit VmapInterpreterMeta(c10::SymInt batchSize, RandomnessType randomness) : + batchSize_(std::move(batchSize)), randomness_(randomness) {} + + c10::SymInt batchSize_; + RandomnessType randomness_; + + VmapInterpreterMeta() = default; + VmapInterpreterMeta(const VmapInterpreterMeta&) = default; + VmapInterpreterMeta(VmapInterpreterMeta&&) = default; + VmapInterpreterMeta& operator=(const VmapInterpreterMeta&) = default; + VmapInterpreterMeta& operator=(VmapInterpreterMeta&&) = default; + ~VmapInterpreterMeta() = default; + + template + friend void to_json(T& json_j, const VmapInterpreterMeta& json_t) { + TORCH_CHECK( + !json_t.batchSize_.is_heap_allocated(), + "Serialization for heap-allocated SymInt is not implemented yet" + ); + json_j["batchSize"] = json_t.batchSize_.as_int_unchecked(); + json_j["randomness"] = static_cast(json_t.randomness_); + } + + template + friend void from_json(const T& json_j, VmapInterpreterMeta& json_t) { + json_t.batchSize_ = c10::SymInt(SymInt::Unchecked::UNCHECKED, json_j["batchSize"]); + json_t.randomness_ = static_cast(json_j["randomness"]); + } +}; + +struct GradInterpreterMeta { + explicit GradInterpreterMeta(bool prevGradMode): prevGradMode_(prevGradMode) {} + GradInterpreterMeta() = default; + GradInterpreterMeta(const GradInterpreterMeta&) = default; + GradInterpreterMeta(GradInterpreterMeta&&) = default; + GradInterpreterMeta& operator=(const GradInterpreterMeta&) = default; + GradInterpreterMeta& operator=(GradInterpreterMeta&&) = default; + ~GradInterpreterMeta() = default; + + bool prevGradMode_; + template + friend void to_json(T& json_j, const GradInterpreterMeta& json_t) { + json_j["prevGradMode"] = json_t.prevGradMode_; + } + + template + friend void from_json(const T& json_j, GradInterpreterMeta& json_t) { + json_t.prevGradMode_ = json_j["prevGradMode"]; + } +}; + +struct JvpInterpreterMeta { + explicit JvpInterpreterMeta(bool prevFwdGradMode) : prevFwdGradMode_(prevFwdGradMode) {} + JvpInterpreterMeta() = default; + JvpInterpreterMeta(const JvpInterpreterMeta&) = default; + JvpInterpreterMeta(JvpInterpreterMeta&&) = default; + JvpInterpreterMeta& operator=(const JvpInterpreterMeta&) = default; + JvpInterpreterMeta& operator=(JvpInterpreterMeta&&) = default; + ~JvpInterpreterMeta() = default; + + bool prevFwdGradMode_; + template + friend void to_json(T& json_j, const JvpInterpreterMeta& json_t) { + json_j["prevFwdGradMode"] = json_t.prevFwdGradMode_; + } + + template + friend void from_json(const T& json_j, JvpInterpreterMeta& json_t) { + json_t.prevFwdGradMode_ = json_j["prevFwdGradMode"]; + } +}; + +struct FunctionalizeInterpreterMeta { + explicit FunctionalizeInterpreterMeta(bool functionalizeAddBackViews) : + functionalizeAddBackViews_(functionalizeAddBackViews) {} + FunctionalizeInterpreterMeta() = default; + FunctionalizeInterpreterMeta(const FunctionalizeInterpreterMeta&) = default; + FunctionalizeInterpreterMeta(FunctionalizeInterpreterMeta&&) = default; + FunctionalizeInterpreterMeta& operator=(const FunctionalizeInterpreterMeta&) = default; + FunctionalizeInterpreterMeta& operator=(FunctionalizeInterpreterMeta&&) = default; + ~FunctionalizeInterpreterMeta() = default; + + bool functionalizeAddBackViews_; + template + friend void to_json(T& json_j, const FunctionalizeInterpreterMeta& json_t) { + json_j["functionalizeAddBackViews"] = json_t.functionalizeAddBackViews_; + } + + template + friend void from_json(const T& json_j, FunctionalizeInterpreterMeta& json_t) { + json_t.functionalizeAddBackViews_ = json_j["functionalizeAddBackViews"]; + } +}; + +typedef std::variant< + int64_t, + GradInterpreterMeta, + JvpInterpreterMeta, + VmapInterpreterMeta, + FunctionalizeInterpreterMeta +> InterpreterMeta; + + +struct Interpreter { + // factory functions + static Interpreter Vmap(int64_t level, c10::SymInt batchSize, RandomnessType randomness) { + return Interpreter(TransformType::Vmap, level, VmapInterpreterMeta(std::move(batchSize), randomness)); + } + static Interpreter Grad(int64_t level, bool prevGradMode) { + return Interpreter(TransformType::Grad, level, GradInterpreterMeta(prevGradMode)); + } + static Interpreter Jvp(int64_t level, bool prevFwdGradMode) { + return Interpreter(TransformType::Jvp, level, JvpInterpreterMeta(prevFwdGradMode)); + } + static Interpreter Functionalize(int64_t level, bool functionalizeAddBackViews) { + return Interpreter(TransformType::Functionalize, level, FunctionalizeInterpreterMeta(functionalizeAddBackViews)); + } + + // methods + TransformType key() const { return type_; } + int64_t level() const { return level_; } + const InterpreterMeta& meta() const { return meta_; } + + void process(const c10::OperatorHandle& op, torch::jit::Stack* stack); + void sendToNextInterpreter(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case); + + void saveLocalDispatchKeySet(c10::impl::LocalDispatchKeySet keyset) { + TORCH_INTERNAL_ASSERT(!savedLocalDispatchKeySet_.has_value()); + savedLocalDispatchKeySet_ = keyset; + } + void clearSavedLocalDispatchKeySet() { + TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value()); + savedLocalDispatchKeySet_ = std::nullopt; + } + c10::impl::LocalDispatchKeySet getSavedLocalDispatchKeySet() const { + TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value()); + return *savedLocalDispatchKeySet_; + } + + // An Interpreter is alive if we are currently inside the ongoing transform + // for the interpreter. For example, vmap(f)(x); inside of f, the vmap's + // corresponding Interpreter is alive, even when it is not on the DynamicLayerStack. + bool is_alive() const { + return *is_alive_; + } + const std::shared_ptr& is_alive_ptr() const { + return is_alive_; + } + void set_is_alive(bool alive) { + *is_alive_ = alive; + } + + // Please don't use this + explicit Interpreter() = default; + + template + friend void to_json(T& json_j, const Interpreter& json_t) { + json_j["type"] = static_cast(json_t.type_); + json_j["level"] = json_t.level_; + if (json_t.savedLocalDispatchKeySet_) { + json_j["savedLocalDispatchKeySet"] = { + {"included", json_t.savedLocalDispatchKeySet_->included_.raw_repr()}, + {"excluded", json_t.savedLocalDispatchKeySet_->excluded_.raw_repr()} + }; + } else { + json_j["savedLocalDispatchKeySet"] = nlohmann::json(); + } + json_j["is_alive"] = *json_t.is_alive_; + std::visit([&](auto&& arg) { + using V = std::decay_t; + if constexpr (std::is_same_v) { + json_j["meta"] = {{"Torch", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Grad", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Jvp", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Vmap", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Functionalize", arg}}; + } else { + static_assert(false && sizeof(V), "unknown variant case"); + } + }, json_t.meta_); + } + + template + friend void from_json(const T& json_j, Interpreter& json_t) { + json_t.type_ = static_cast(json_j["type"]); + json_t.level_ = json_j["level"]; + auto savedLocalDispatchKeySet = json_j["savedLocalDispatchKeySet"]; + if (savedLocalDispatchKeySet.is_null()) { + json_t.savedLocalDispatchKeySet_ = std::nullopt; + } else { + c10::impl::PODLocalDispatchKeySet pod; + pod.set_included(DispatchKeySet::from_raw_repr(savedLocalDispatchKeySet["included"].template get())); + pod.set_excluded(DispatchKeySet::from_raw_repr(savedLocalDispatchKeySet["excluded"].template get())); + json_t.savedLocalDispatchKeySet_ = c10::impl::LocalDispatchKeySet(pod); + } + json_t.is_alive_ = std::make_shared(json_j["is_alive"]); + auto meta = json_j["meta"]; + if (meta.contains("Torch")) { + json_t.meta_.emplace(meta["Torch"].template get()); + } else if (meta.contains("Grad")) { + json_t.meta_.emplace(meta["Grad"].template get()); + } else if (meta.contains("Jvp")) { + json_t.meta_.emplace(meta["Jvp"].template get()); + } else if (meta.contains("Vmap")) { + json_t.meta_.emplace(meta["Vmap"].template get()); + } else if (meta.contains("Functionalize")) { + json_t.meta_.emplace(meta["Functionalize"].template get()); + } else { + TORCH_CHECK(false, "unknown interpreter metadata type"); + } + } + + std::string serialize() const { + return nlohmann::json(*this).dump(); + } + + static Interpreter deserialize(const std::string& serialized) { + return nlohmann::json::parse(serialized).get(); + } + + private: + explicit Interpreter(TransformType type, int64_t level, InterpreterMeta meta): + type_(type), level_(level), is_alive_(std::make_shared(false)), meta_(std::move(meta)) {} + + // fields + TransformType type_{}; + int64_t level_{}; + std::optional savedLocalDispatchKeySet_; + std::shared_ptr is_alive_; + InterpreterMeta meta_; +}; + +// Applies the following for-loop: +// for i in range(begin, end): +// args[i] = func(args[i]) +void foreachTensorInplace(std::vector& args, int64_t begin, int64_t end, + std::function func); + +// Applies the following for-loop: +// for i in range(begin, end): +// if use_flag_relative[i] == 1: <-- treats use_flag_relative as a bitset +// args[i] = func(args[i], i - begin, true) +// args[i] = func(args[i], i - begin) +void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int64_t end, + const std::bitset<64> use_flag_relative, const std::function& func); + +std::vector findUnwrappedInputs(std::vector& args, int64_t begin, int64_t end); + +DispatchKeySet keysToExcludeWhenEnteringDynamicLayer(TransformType key); + +void setup_dispatch_key_tls(TransformType key, DispatchKeySet include); + +void sanityCheckStack(const c10::OperatorHandle& op, torch::jit::Stack* stack); + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/LegacyVmapTransforms.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/LegacyVmapTransforms.h new file mode 100644 index 0000000000000000000000000000000000000000..aa7e4e2645bfa7ea7c63e3cc9ec99721769697f6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/LegacyVmapTransforms.h @@ -0,0 +1,192 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include + +namespace at::functorch { + +// This files contains the legacy (now-deprecated) batching rule API. +// Please try to use the new-style batching rule API (see writing_batch_rules.md) + +// This file contains abstractions used for transforming *logical* vmap arguments +// into *physical* arguments. (Keep reading for definitions of these terms). + +// NOTE: [Logical vs physical args] +// Consider the following vmap. +// vmap(vmap(func, in_dims=(2,)), in_dims=(0,))(torch.ones(2, 3, 4)) +// This would produce a BatchedTensor wrapping a Tensor of size [2, 3, 4], +// with batch dims 0 and 2: +// BatchedTensor(ones(2, 3, 4), bdims=[(lvl=1,dim=0),(lvl=2,dim=2)]) +// +// We say the *logical* view of the tensor has size [3] -- tensors inside +// `func` appear to have size [3]. +// However, the *physical* underlying tensor (the one passed to vmap) has size +// [2, 3, 4]. +// +// This notion of logical vs physical also extends to non-tensor arguments. +// Consider the previous tensor; let's assume the user called +// `torch.sum(tensor, dim=0)` inside of `func`. Then the logical +// dimension they are reducing over is dim 0 but the physical dim is dim 1 +// (the first non-batch dimension) + +// Forward declared; see NOTE: [What is a VmapPhysicalView?] +struct VmapPhysicalView; + +// Most PyTorch operators take 4 or fewer inputs. +constexpr int64_t kVmapTransformStaticInputSize = 4; +using VmapPhysicalViewVec = SmallVector; + +// Pytorch generally advertises good performance for <= 5 dims. +// (see ATen/core/DimVector.h). We add a few extra dims (~3) for vmap +// dimensions to get 8. Adjust this number as necessary +constexpr int64_t kVmapStaticDimVecSize = 8; +using VmapDimVector = SmallVector; +using VmapSymDimVector = SmallVector; + +// NOTE: [What is an VmapTransform?] +// An *VmapTransform* converts logical views of tensors to physical views. +// +// Batching rules use VmapTransforms to convert logical arguments to +// physical arguments, then call one or more at:: operator that handles the +// physical arguments, and then converts the physical result back to a logical +// argument. + +// VmapTransform for operators that take tensors with multiple batch dims. +// Given one or more logical views on Tensors, `logicalToPhysical` +// permutes all of the batch dims to the front of the tensor, aligns +// and expands the batch dims to match each other (according to their `level`), +// and returns a VmapPhysicalView on the tensor(s). +struct TORCH_API MultiBatchVmapTransform { + static VmapPhysicalView logicalToPhysical(const Tensor& logical_tensor); + static VmapPhysicalViewVec logicalToPhysical(ITensorListRef logical_tensors); +}; + +// VmapTransform for operators that broadcast all inputs. +// Given some logical views on Tensors, `logicalToPhysical`: +// - permutes all of the batch dims to the front of the tensors +// - aligns all the batch dims to the collective levels of all of the tensors. +// If a tensor does not have a batch dim for a vmap level, then it receives +// a size-one dimension for said level. +// - aligns the non-batch dims to have the same dimensionality, adding extra +// size-1 dimensions in between the batch dimensions and the non-batch dimensions +// so that the batch dimensions are lined up from the right. +// +// For example: given inputs of size (B, 2) and (B, 3, 2) where B is the batch +// dimension, BroadcastingVmapTransform returns VmapPhysicalViews that wrap tensors +// of size (B, 1, 2) and (B, 3, 2). +// +// Given inputs of size (B, 2) and (2,), BroadcastingVmapTransform returns +// VmapPhysicalViews wrapping tensors of size (B, 2) and (1, 2). We don't +// actually *need* to return a tensor of size (1, 2) for the second tensor +// because the broadcasting operation takes care of that for us, but we do +// it anyways to keep things simple. +struct TORCH_API BroadcastingVmapTransform { + static VmapPhysicalViewVec logicalToPhysical(TensorList logical_tensors); +}; + +// Forward declared, if you're reading this file head to toe, don't worry about +// it yet. +struct VmapPhysicalToLogicalMap; + +// NOTE: [What is a VmapPhysicalView?] +// VmapPhysicalView represents a physical view on a Tensor. +// +// One can use it to further convert logical dimension indices, logical shapes, +// and more to their physical variants, or convert a new (physical) tensor into +// a logical BatchedTensor. (TODO(rzou): some of these are not yet implemented). +// +// VmapPhysicalView stores a physical tensor with all of its batch dimensions at +// the front and some levels that correspond to said batch dimensions. +// +// The levels bitset specifies which vmap levels correspond to the batch +// dimensions at the front of the tensor. In particular, the number of set bits +// corresponds to the number of batch dimensions on `tensor` and the rightmost +// bit of `levels` specifies the maximum number of nested vmaps we are in at +// this point in time. +// For example, given: +// physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5, 6), levels={1, 3}) +// +// Rightmost bit of `levels` is 3 indicating the number of nested vmaps less +// than or equal to 3. +// bitset: 010100 +// ^ +// | +// levels: 012345 +struct TORCH_API VmapPhysicalView { + VmapPhysicalView(Tensor&& tensor, std::bitset levels) + : levels_(levels), tensor_(std::move(tensor)) { + // TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor)); + } + + Tensor& tensor() { return tensor_; } + const Tensor& tensor() const { return tensor_; } + + // Maps logical dim indices to physical dim indices. Also does dim wrapping. + // + // For example, given: + // physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5), levels={1, 3}) + // + // Then physical_view.getPhysicalDims({0, 1}) returns {2, 3}. + // This is because the size of levels tell us that the first two dimensions + // of `tensor_` are batch dimensions, so a logical dim of `n` is actually + // a physical dim of `n + 2`. + VmapDimVector getPhysicalDims(IntArrayRef logical_dims) const; + int64_t getPhysicalDim(int64_t logical_dim) const; + + // Returns a VmapPhysicalToLogicalMap object. This can be used for + // mapping a physical tensor to a new logical tensor (BatchedTensor) + VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const; + + // Maps a logical shape to a physical shape by prepending the batch + // sizes to the logical shape. + VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const; + SymDimVector getPhysicalShape(c10::SymIntArrayRef logical_shape) const; + + int64_t numBatchDims() const; + + private: + int64_t numLogicalDims() const; + + std::bitset levels_; + Tensor tensor_; +}; + +// Convenience struct used for mapping a physical tensor (a non-BatchedTensor) +// to a logical one (BatchedTensor). It holds some levels that are used to do the +// mapping and assumes that the batch dimensions in the physical tensor all +// occur at the front of the tensor. +struct TORCH_API VmapPhysicalToLogicalMap { + VmapPhysicalToLogicalMap(std::bitset levels): levels_(levels) {} + + // Maps a physical tensor to a new logical tensor (BatchedTensor). + // Assumes that all of the "batch dimensions" are at the front + // of the physical tensor. For example, given: + // - x = rank-4 Tensor with size 2, 3, 5, 7 + // - levels = (2, 4) + // Returns: + // - BatchedTensor(x, bdims=[(dim=0,lvl=2), (dim=1, lvl=4)]) + Tensor apply(const Tensor& physical_tensor) const; + + // Given a vector of physical tensors, + // 1. maps each tensor to a new logical tensor. Assumes that all of the + // "batch dimensions" are at the front of the physical tensors. + // 2. stores the new logical tensors back into the passed-in vector. This is + // to avoid additional dynamic allocations. + void applyInplace(std::vector& physical_tensors) const; + + std::bitset levels_; +}; + + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/Macros.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/Macros.h new file mode 100644 index 0000000000000000000000000000000000000000..bd7386a3a3cd55c40a744d2c8700c0a6021a008c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/Macros.h @@ -0,0 +1,8 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#define SINGLE_ARG(...) __VA_ARGS__ + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/PlumbingHelper.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/PlumbingHelper.h new file mode 100644 index 0000000000000000000000000000000000000000..4ba1c6bf041735558f119d0921921d2b10ba0d9f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/PlumbingHelper.h @@ -0,0 +1,68 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +#pragma once +#include +#include +#include + +// NOTE: [vmap plumbing] +// +// Here's how "batching rules" work. +// - we register kernels to the Batched key +// - these kernels have the same signatures as the original operators. +// For example, at::sin(Tensor self) accepts a Tensor, and the batched kernel +// must also accept a Tensor +// - However, it is more natural for users to write a batching rule like the +// following: sin_batch_rule(Tensor self, std::optional self_bdim) +// - There is some codegenerated layer (the "plumbing") that wraps the user +// defined batching rule (e.g. sin_batch_rule) in a kernel that can be +// registered to the Batched key. +// +// The plumbing is responsible for wrapping a batching rule into a form that may +// be registered as the kernel for the batched key. + +namespace at::functorch { + +void vmap_check_escaped(const std::optional &layer, const char* what); + +// Create a BatchedTensor given a tensor, bdim, and level +TORCH_API Tensor makeBatched(Tensor tensor, std::optional bdim, int64_t level); + +// Given a Tensor that may or may not be a BatchedTensor, unwrap it. +// If `tensor` is not a BatchedTensor, or is a BatchedTensor but the level +// doesn't match, then this returns (tensor, std::nullopt). +// Otherwise, it returns (unwrap(tensor), bdim). +TORCH_API std::tuple> unwrapTensorAtLevel(const Tensor& tensor, int64_t level); + +// Creates a vector of BatchedTensor +TORCH_API std::vector makeBatchedVector(std::vector tensors, std::optional bdim, int64_t level); + +// Returns True if ANY tensor in tensors is batched at level +TORCH_API bool isBatchedAtLevel(ITensorListRef tensors, int64_t level); +TORCH_API bool isBatchedAtLevel(const c10::List>& maybe_tensors, int64_t level); +TORCH_API bool isBatchedAtLevel(const Tensor& tensor, int64_t level); +TORCH_API bool isBatchedAtLevel(const std::optional& maybe_tensor, int64_t level); + +// Convenience helper. Returns true if any tensor is batched at level +TORCH_API bool areAnyBatchedAtLevel(ArrayRef> maybe_tensors, int64_t level); + +inline bool ivalueParticipatesInCurrentLevel(const IValue& ivalue) { + if (ivalue.isTensor()) { + auto maybe_level = maybeCurrentDynamicLayer(); + TORCH_INTERNAL_ASSERT(maybe_level.has_value()); + auto current_level = maybe_level->layerId(); + return isBatchedAtLevel(ivalue.toTensor(), current_level); + } + // TODO: should really check this + return false; +} + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/TensorWrapper.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/TensorWrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..4f2d6f5c708b58f2997d40051c7bdbe9cf3f236f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/TensorWrapper.h @@ -0,0 +1,108 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +#include + +namespace at::functorch { + +// NOTE: [functorch's TensorWrapper] +// +// Taking better suggestions for a name. TensorWrapper is the wrapper Tensor +// Subclass for functorch's grad-based transforms (grad, vjp, jvp). It is +// analogous to how vmap uses BatchedTensor as the wrapper Tensor subclass. +// +// If you're familiar with the Tensor-Variable merge, TensorWrapper is effectively +// another Variable. +// +// Consider grad(grad(torch.sin))(x). This wraps `x` as TensorWrapper(TensorWrapper(x)). +// The reason why is so that each TensorWrapper can hold its own AutogradMeta and +// participate in a **separate** autograd graph. +// +// There are alternative designs we could have chosen (e.g. each grad transform +// stores a weak map of Tensor -> AutogradMeta); the benefit of the TensorWrapper +// design is that we can reuse existing VariableType kernels (i.e. Autograd kernels) +// without much modification. Since a TensorWrapper looks like a regular Tensor, +// the VariableType kernel can pull out the AutogradMeta struct from where it +// expects and extend the autograd graph + +struct TORCH_API TensorWrapper : public c10::TensorImpl { + explicit TensorWrapper( + c10::DispatchKeySet key_set, + Tensor value, + int64_t level, + std::shared_ptr is_alive, + bool is_immutable = false, // if true, this came from an operation that aliases an immutable tensor + bool use_value_sizes_strides = true); + + void refreshMetadata(); + + const Tensor& value() const { + return value_; + } + std::optional level() const { + if (is_alive()) { + return level_; + } + return {}; + } + bool is_immutable() const { + return is_immutable_; + } + bool is_alive() const; + + // Overrides necessary for autograd + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override; + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override; + void shallow_copy_from(const c10::intrusive_ptr& impl) override; + + private: + const char* tensorimpl_type_name() const override; + Tensor value_; + int64_t level_; + bool is_immutable_; + + // TensorWrapper receives a boolean flag on whether or not the Grad Interpreter + // that created it is still alive or not. + // If the Grad Interpreter is no longer alive then it attempts to behave like + // a regular Tensor. + // + // When we exit the level, this wrapper may be marked as "not alive". + // Wrappers that are not alive: + // 1) May still have autograd metadata on them + // 2) Forward dispatches to the underlying value() + std::shared_ptr is_alive_; +}; + +// There are two variants of makeTensorWrapper: one that accepts a level +// and one that accepts an Interpreter. +// +// The one that accepts a level tries to automatically get the life handle from the +// interpreter on the DynamicLayerStack. +// It needs to be used with caution: if the interpreter is not on the +// DynamicLayerStack, then we won't be able to find the life handle. +// +// In practice this isn't a problem: when we're constructing TensorWrapper in +// Python, the corresponding interpreter is on the stack. +TORCH_API Tensor makeTensorWrapper(const Tensor& tensor, int64_t level, bool is_immutable=false); +TORCH_API Tensor makeTensorWrapper(const Tensor& tensor, const Interpreter& interpreter, bool is_immutable=false); +TORCH_API TensorWrapper* maybeGetTensorWrapper(const Tensor& tensor); +TORCH_API void dumpTensor(std::ostream & ss, const Tensor& tensor); +TORCH_API void dumpTensorCout(const Tensor& tensor); + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/VmapInterpreter.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/VmapInterpreter.h new file mode 100644 index 0000000000000000000000000000000000000000..f1fc965730899b849d263d98d8faa0e891f777ae --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/VmapInterpreter.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::functorch { + +// This is the interpreter that handles the functionalize() transform. +// See NOTE: [functorch interpreter stack] for more details. + +struct VmapInterpreterPtr { + explicit VmapInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Vmap); } + TransformType key() const { return base_->key(); } + int64_t level() const { return base_->level(); } + void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack); + void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case); + c10::SymInt batchSize() const { + return std::get(base_->meta()).batchSize_; + } + RandomnessType randomness() const { + return std::get(base_->meta()).randomness_; + } + private: + const Interpreter* base_; +}; + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..005f4b0a55c787c61c76fbe4acbdc870e8dd9fb5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -0,0 +1,248 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +// Use of c10::hip namespace here makes hipification easier, because +// I don't have to also fix namespaces. Sorry! +namespace c10::hip { + +// Takes a valid HIPAllocator (of any sort) and turns it into +// an allocator pretending to be a CUDA allocator. See +// Note [Masquerading as CUDA] +class HIPAllocatorMasqueradingAsCUDA final : public HIPCachingAllocator::HIPAllocator { + HIPCachingAllocator::HIPAllocator* allocator_; +public: + explicit HIPAllocatorMasqueradingAsCUDA(HIPCachingAllocator::HIPAllocator* allocator) + : allocator_(allocator) {} + + virtual ~HIPAllocatorMasqueradingAsCUDA() = default; + + // From c10::Allocator + + DataPtr allocate(size_t size) override { + DataPtr r = allocator_->allocate(size); + r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index())); + return r; + } + + bool is_simple_data_ptr(const DataPtr& data_ptr) const override { + return allocator_->is_simple_data_ptr(data_ptr); + } + + DeleterFnPtr raw_deleter() const override { + return allocator_->raw_deleter(); + } + + void copy_data(void* dest, const void* src, std::size_t count) const final { + allocator_->copy_data(dest, src, count); + } + + // From DeviceAllocator + + bool initialized() override { + return allocator_->initialized(); + } + + void emptyCache(MempoolId_t mempool_id = {0, 0}) override { + allocator_->emptyCache(mempool_id); + } + + void recordStream(const DataPtr& ptr, c10::Stream stream) override { + HIPStream hip_stream = HIPStream(stream); + recordStream(ptr, hip_stream); + } + + CachingDeviceAllocator::DeviceStats getDeviceStats(c10::DeviceIndex device) override { + return allocator_->getDeviceStats(device); + } + + void resetAccumulatedStats(c10::DeviceIndex device) override { + allocator_->resetAccumulatedStats(device); + } + + void resetPeakStats(c10::DeviceIndex device) override { + allocator_->resetPeakStats(device); + } + + // From CUDAAllocator + + void* raw_alloc(size_t nbytes) override { + return allocator_->raw_alloc(nbytes); + } + + void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) override { + return allocator_->raw_alloc_with_stream(nbytes, stream); + } + + void raw_delete(void* ptr) override { + allocator_->raw_delete(ptr); + } + + void init(int device_count) override { + allocator_->init(device_count); + } + + double getMemoryFraction(c10::DeviceIndex device) override { + return allocator_->getMemoryFraction(device); + } + + void setMemoryFraction(double fraction, c10::DeviceIndex device) override { + allocator_->setMemoryFraction(fraction, device); + } + + std::vector getExpandableSegmentSizes(c10::DeviceIndex device) override { + return allocator_->getExpandableSegmentSizes(device); + } + + void enable(bool value) override { + allocator_->enable(value); + } + + bool isEnabled() const override { + return allocator_->isEnabled(); + } + + void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override { + allocator_->cacheInfo(device, largestBlock); + } + + void* getBaseAllocation(void* ptr, size_t* size) override { + return allocator_->getBaseAllocation(ptr, size); + } + + void recordStream(const DataPtr& ptr, HIPStream stream) override { + allocator_->recordStream(ptr, stream); + } + + HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) override { + return allocator_->snapshot(mempool_id); + } + + void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) override { + allocator_->beginAllocateToPool(device, mempool_id, filter); + } + + void endAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id) override { + allocator_->endAllocateToPool(device, mempool_id); + } + + void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->releasePool(device, mempool_id); + } + + int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) override { + return allocator_->getPoolUseCount(device, mempool_id); + } + + void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPAllocator* allocator = nullptr) override { + allocator_->createOrIncrefPool(device, mempool_id, allocator); + } + + void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->setUseOnOOM(device, mempool_id); + } + + void setNoSplit(c10::DeviceIndex device, MempoolId_t mempool_id) override { + allocator_->setNoSplit(device, mempool_id); + } + + bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) override { + return allocator_->checkPoolLiveAllocations(device, mempool_id, expected_live_allocations); + } + + HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) override { + return allocator_->shareIpcHandle(ptr); + } + + std::shared_ptr getIpcDevPtr(std::string handle) override { + return allocator_->getIpcDevPtr(handle); + } + + bool isHistoryEnabled() override { + return allocator_->isHistoryEnabled(); + } + + void recordHistory( + bool enabled, + HIPCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + HIPCachingAllocator::RecordContext when, + bool clearHistory) override { + allocator_->recordHistory(enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); + } + + void recordAnnotation( + const std::vector>& md) override { + allocator_->recordAnnotation(md); + } + + void pushCompileContext(std::string& md) override { + allocator_->pushCompileContext(md); + } + + void popCompileContext() override { + allocator_->popCompileContext(); + } + + void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) override { + allocator_->attachOutOfMemoryObserver(observer); + } + + void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) override { + allocator_->attachAllocatorTraceTracker(tracker); + } + + void enablePeerAccess(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) override { + allocator_->enablePeerAccess(dev, dev_to_access); + } + + hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) override { + return allocator_->memcpyAsync(dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); + } + + std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) override { + return allocator_->getCheckpointState(device, id); + } + + HIPCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) override { + auto cpd = allocator_->setCheckpointPoolState(device, pps); + for (auto& ptr : cpd.dataptrs_allocd) { + ptr.unsafe_set_device(Device(c10::DeviceType::CUDA, ptr.device().index())); + } + return cpd; + } + + std::string name() override { + return allocator_->name(); + } + +}; + +} // namespace c10::hip + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..5f0214ee3c8c9d23a07aa2070f92e78bdbc326a8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h @@ -0,0 +1,203 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { +// forward declaration +class DataPtr; +namespace hip { +namespace HIPCachingAllocatorMasqueradingAsCUDA { + +C10_HIP_API HIPCachingAllocator::HIPAllocator* get(); +C10_HIP_API void recordStreamMasqueradingAsCUDA(const DataPtr& ptr, HIPStreamMasqueradingAsCUDA stream); + +inline void* raw_alloc(size_t nbytes) { + return get()->raw_alloc(nbytes); +} + +inline void* raw_alloc_with_stream(size_t nbytes, hipStream_t stream) { + return get()->raw_alloc_with_stream(nbytes, stream); +} + +inline void raw_delete(void* ptr) { + return get()->raw_delete(ptr); +} + +inline void init(int device_count) { + return get()->init(device_count); +} + +inline double getMemoryFraction(c10::DeviceIndex device) { + return get()->getMemoryFraction(device); +} + +inline void setMemoryFraction(double fraction, c10::DeviceIndex device) { + return get()->setMemoryFraction(fraction, device); +} + +inline void emptyCache(MempoolId_t mempool_id = {0, 0}) { + return get()->emptyCache(mempool_id); +} + +inline void enable(bool value) { + return get()->enable(value); +} + +inline bool isEnabled() { + return get()->isEnabled(); +} + +inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) { + return get()->cacheInfo(device, largestBlock); +} + +inline void* getBaseAllocation(void* ptr, size_t* size) { + return get()->getBaseAllocation(ptr, size); +} + +inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats( + c10::DeviceIndex device) { + return get()->getDeviceStats(device); +} + +inline void resetAccumulatedStats(c10::DeviceIndex device) { + return get()->resetAccumulatedStats(device); +} + +inline void resetPeakStats(c10::DeviceIndex device) { + return get()->resetPeakStats(device); +} + +inline HIPCachingAllocator::SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) { + return get()->snapshot(mempool_id); +} + +inline std::shared_ptr getCheckpointState( + c10::DeviceIndex device, + MempoolId_t id) { + return get()->getCheckpointState(device, id); +} + +inline HIPCachingAllocator::CheckpointDelta setCheckpointPoolState( + c10::DeviceIndex device, + std::shared_ptr pps) { + return get()->setCheckpointPoolState(device, std::move(pps)); +} + +inline void beginAllocateToPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + std::function filter) { + get()->beginAllocateToPool(device, mempool_id, std::move(filter)); +} + +inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->endAllocateToPool(device, mempool_id); +} + +inline void recordHistory( + bool enabled, + HIPCachingAllocator::CreateContextFn context_recorder, + size_t alloc_trace_max_entries, + HIPCachingAllocator::RecordContext when, + bool clearHistory) { + return get()->recordHistory( + enabled, context_recorder, alloc_trace_max_entries, when, clearHistory); +} + +inline void recordAnnotation( + const std::vector>& md) { + return get()->recordAnnotation(md); +} + +inline void pushCompileContext(std::string& md) { + return get()->pushCompileContext(md); +} + +inline void popCompileContext() { + return get()->popCompileContext(); +} + +inline bool isHistoryEnabled() { + return get()->isHistoryEnabled(); +} + +inline bool checkPoolLiveAllocations( + c10::DeviceIndex device, + MempoolId_t mempool_id, + const std::unordered_set& expected_live_allocations) { + return get()->checkPoolLiveAllocations( + device, mempool_id, expected_live_allocations); +} + +inline void attachOutOfMemoryObserver(HIPCachingAllocator::OutOfMemoryObserver observer) { + return get()->attachOutOfMemoryObserver(std::move(observer)); +} + +inline void attachAllocatorTraceTracker(HIPCachingAllocator::AllocatorTraceTracker tracker) { + return get()->attachAllocatorTraceTracker(std::move(tracker)); +} + +inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->releasePool(device, mempool_id); +} + +inline void createOrIncrefPool( + c10::DeviceIndex device, + MempoolId_t mempool_id, + HIPCachingAllocator::HIPAllocator* allocator_ptr = nullptr) { + get()->createOrIncrefPool(device, mempool_id, allocator_ptr); +} + +inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->setUseOnOOM(device, mempool_id); +} + +inline void setNoSplit(c10::DeviceIndex device, MempoolId_t mempool_id) { + get()->setNoSplit(device, mempool_id); +} + +inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) { + return get()->getPoolUseCount(device, mempool_id); +} + +inline std::shared_ptr getIpcDevPtr(std::string handle) { + return get()->getIpcDevPtr(std::move(handle)); +} + +inline HIPCachingAllocator::ShareableHandle shareIpcHandle(void* ptr) { + return get()->shareIpcHandle(ptr); +} + +inline std::string name() { + return get()->name(); +} + +inline hipError_t memcpyAsync( + void* dst, + int dstDevice, + const void* src, + int srcDevice, + size_t count, + hipStream_t stream, + bool p2p_enabled) { + return get()->memcpyAsync( + dst, dstDevice, src, srcDevice, count, stream, p2p_enabled); +} + +inline void enablePeerAccess( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access) { + return get()->enablePeerAccess(dev, dev_to_access); +} + +} // namespace HIPCachingAllocatorMasqueradingAsCUDA +} // namespace hip +} // namespace c10 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..be82a7c22e3f62e77ef9b9f232fe1f1ce864efa8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h @@ -0,0 +1,388 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +// The includes of HIPGuard.h +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +// Use of c10::hip namespace here makes hipification easier, because +// I don't have to also fix namespaces. Sorry! +namespace c10 { namespace hip { + +// Note [Masquerading as CUDA] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// c10_hip is very easy to understand: it is HIPified from c10_cuda, +// and anywhere you said CUDA, the source code now says HIP. HIPified +// PyTorch is much harder to understand: it is HIPified from regular +// PyTorch, yes, but NO source-to-source translation from CUDA to +// HIP occurs; instead, anywhere we see "CUDA", it actually means "HIP". +// For example, when you use HIPified PyTorch, you say x.cuda() to +// move a tensor onto ROCm device. We call this situation "HIP +// masquerading as CUDA". +// +// This leads to a very awkward situation when we want to call c10_hip +// code from PyTorch, since c10_hip is expecting things to be called +// HIP, but PyTorch is calling them CUDA (masquerading as HIP). To +// fix this impedance mismatch, we have MasqueradingAsCUDA variants +// for all c10_hip classes. These translate between the "HIP" and "CUDA +// masquerading as HIP" worlds. For example, +// HIPGuardImplMasqueradingAsCUDA (this file) provides something like a +// HIPGuardImpl, but it reports its DeviceType as CUDA (e.g., type() +// returns CUDA, getDevice() reports the current HIP device as a CUDA +// device.) +// +// We should be able to delete all of these classes entirely once +// we switch PyTorch to calling a HIP a HIP. +// +// When you add a new MasqueradingAsCUDA class/function, you need to +// also update the rewrite rules in torch/utils/hipify/cuda_to_hip_mappings.py +// +// +// +// By the way, note that the cpp file associated with this also +// *overwrites* the entry in the DeviceGuardImpl registry for CUDA with +// this HIP implementation. + +struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplInterface { + static constexpr c10::DeviceType static_type = c10::DeviceType::CUDA; + HIPGuardImplMasqueradingAsCUDA() {} + HIPGuardImplMasqueradingAsCUDA(c10::DeviceType t) { + TORCH_INTERNAL_ASSERT(t == c10::DeviceType::CUDA); + } + c10::DeviceType type() const override { + return c10::DeviceType::CUDA; + } + Device exchangeDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_cuda()); + Device old_device = getDevice(); + if (old_device.index() != d.index()) { + C10_HIP_CHECK(hipSetDevice(d.index())); + } + return old_device; + } + Device getDevice() const override { + int device; + C10_HIP_CHECK(hipGetDevice(&device)); + return Device(c10::DeviceType::CUDA, device); + } + void setDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_cuda()); + C10_HIP_CHECK(hipSetDevice(d.index())); + } + void uncheckedSetDevice(Device d) const noexcept override { + C10_HIP_CHECK_WARN(hipSetDevice(d.index())); + } + Stream getStream(Device d) const override { + return getCurrentHIPStreamMasqueradingAsCUDA(d.index()).unwrap(); + } + Stream getDefaultStream(Device d) const override { + return getDefaultHIPStreamMasqueradingAsCUDA(d.index()); + } + Stream getNewStream(Device d, int priority = 0) const override { + return getStreamFromPoolMasqueradingAsCUDA(priority, d.index()); + } + Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) const override { + return getStreamFromPoolMasqueradingAsCUDA(isHighPriority, d.index()); + } + Stream exchangeStream(Stream s) const override { + HIPStreamMasqueradingAsCUDA cs(s); + auto old_stream = getCurrentHIPStreamMasqueradingAsCUDA(s.device().index()); + setCurrentHIPStreamMasqueradingAsCUDA(cs); + return old_stream.unwrap(); + } + DeviceIndex deviceCount() const noexcept override { + int deviceCnt; + hipError_t _err; + _err = hipGetDeviceCount(&deviceCnt); + if(_err != hipErrorNoDevice && _err != hipSuccess) + C10_HIP_CHECK(_err); + return deviceCnt; + } + + // Event-related functions + // Note: hipEventCreateWithFlags should be called on the same device as + // the recording stream's device. + void createEvent( + hipEvent_t* hip_event, + const EventFlag flag) const { + // Maps PyTorch's Event::Flag to HIP flag + auto hip_flag = hipEventDefault; + switch (flag) { + case EventFlag::PYTORCH_DEFAULT: + hip_flag = hipEventDisableTiming; + break; + case EventFlag::BACKEND_DEFAULT: + hip_flag = hipEventDefault; + break; + default: + TORCH_CHECK(false, "HIP event received unknown flag"); + } + + C10_HIP_CHECK(hipEventCreateWithFlags(hip_event, hip_flag)); + } + + void destroyEvent( + void* event, + const DeviceIndex device_index) const noexcept override { + if (!event) return; + auto hip_event = static_cast(event); + int orig_device; + C10_HIP_CHECK_WARN(hipGetDevice(&orig_device)); + C10_HIP_CHECK_WARN(hipSetDevice(device_index)); + C10_HIP_CHECK_WARN(hipEventDestroy(hip_event)); + C10_HIP_CHECK_WARN(hipSetDevice(orig_device)); + } + + void record(void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override { + TORCH_CHECK(device_index == -1 || device_index == stream.device_index(), + "Event device index ", + device_index, + " does not match recording stream's device index ", + stream.device_index(), + "."); + + hipEvent_t hip_event = static_cast(*event); + HIPStreamMasqueradingAsCUDA hip_stream{stream}; + + // Moves to stream's device to record + const auto orig_device = getDevice(); + setDevice(stream.device()); + + // Creates the event (lazily) + if (!hip_event) createEvent(&hip_event, flag); + C10_HIP_CHECK(hipEventRecord(hip_event, hip_stream)); + // Makes the void* point to the (possibly just allocated) HIP event + *event = hip_event; + + // Resets device + setDevice(orig_device); + } + + void block( + void* event, + const Stream& stream) const override { + if (!event) return; + hipEvent_t hip_event = static_cast(event); + HIPStreamMasqueradingAsCUDA hip_stream{stream}; + const auto orig_device = getDevice(); + setDevice(stream.device()); + C10_HIP_CHECK(hipStreamWaitEvent( + hip_stream, + hip_event, + /*flags (must be zero)=*/ 0)); + setDevice(orig_device); + } + + bool queryEvent(void* event) const override { + if (!event) return true; + hipEvent_t hip_event = static_cast(event); + const hipError_t err = hipEventQuery(hip_event); + if (err != hipErrorNotReady) C10_HIP_CHECK(err); + else { + // ignore and clear the error if not ready + (void)hipGetLastError(); + } + return (err == hipSuccess); + } + + // Stream-related functions + bool queryStream(const Stream& stream) const override { + HIPStreamMasqueradingAsCUDA hip_stream{stream}; + return hip_stream.query(); + } + + void synchronizeStream(const Stream& stream) const override { + HIPStreamMasqueradingAsCUDA hip_stream{stream}; + hip_stream.synchronize(); + } + + void synchronizeEvent(void* event) const override { + if (!event) + return; + hipEvent_t hip_event = static_cast(event); + C10_HIP_CHECK(hipEventSynchronize(hip_event)); + } + + // Note: synchronizeDevice can be safely called from any device + void synchronizeDevice(const c10::DeviceIndex device_index) const override { + int orig_device{-1}; + C10_HIP_CHECK(hipGetDevice(&orig_device)); + C10_HIP_CHECK(hipSetDevice(device_index)); + C10_HIP_CHECK(hipDeviceSynchronize()); + C10_HIP_CHECK(hipSetDevice(orig_device)); + } + + void recordDataPtrOnStream( + const c10::DataPtr& data_ptr, + const Stream& stream) const override { + HIPStreamMasqueradingAsCUDA hip_stream{stream}; + HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA(data_ptr, hip_stream); + } + + double elapsedTime(void* event1, void* event2, const DeviceIndex device_index) + const override { + TORCH_CHECK( + event1 && event2, + "Both events must be recorded before calculating elapsed time."); + int orig_device; + C10_HIP_CHECK(hipGetDevice(&orig_device)); + C10_HIP_CHECK(hipSetDevice(device_index)); + hipEvent_t hip_event1 = static_cast(event1); + hipEvent_t hip_event2 = static_cast(event2); + float time_ms = 0; + // raise hipErrorNotReady if either event is recorded but not yet completed + C10_HIP_CHECK(hipEventElapsedTime(&time_ms, hip_event1, hip_event2)); + C10_HIP_CHECK(hipSetDevice(orig_device)); + return static_cast(time_ms); + } +}; + +// All of the guards which have HIPGuardImpl burned in need to also have +// variants using HIPGuardImplMasqueradingAsCUDA. + +/// This code is all a direct copy from c10/cuda/HIPGuardMasqueradingAsCUDA.h, but with +/// the correct InlineDeviceGuard burned in. Sorry about the +/// copy-pasting. + +struct HIPGuardMasqueradingAsCUDA { + explicit HIPGuardMasqueradingAsCUDA() = delete; + explicit HIPGuardMasqueradingAsCUDA(DeviceIndex device_index) : guard_(device_index) {} + explicit HIPGuardMasqueradingAsCUDA(Device device) : guard_(device) {} + + HIPGuardMasqueradingAsCUDA(const HIPGuardMasqueradingAsCUDA&) = delete; + HIPGuardMasqueradingAsCUDA& operator=(const HIPGuardMasqueradingAsCUDA&) = delete; + HIPGuardMasqueradingAsCUDA(HIPGuardMasqueradingAsCUDA&& other) = delete; + HIPGuardMasqueradingAsCUDA& operator=(HIPGuardMasqueradingAsCUDA&& other) = delete; + + void set_device(Device device) { guard_.set_device(device); } + void reset_device(Device device) { guard_.reset_device(device); } + void set_index(DeviceIndex device_index) { guard_.set_index(device_index); } + Device original_device() const { return guard_.original_device(); } + Device current_device() const { return guard_.current_device(); } + + private: + c10::impl::InlineDeviceGuard guard_; +}; + +struct OptionalHIPGuardMasqueradingAsCUDA { + explicit OptionalHIPGuardMasqueradingAsCUDA() : guard_() {} + explicit OptionalHIPGuardMasqueradingAsCUDA(std::optional device_opt) : guard_(device_opt) {} + explicit OptionalHIPGuardMasqueradingAsCUDA(std::optional device_index_opt) : guard_(device_index_opt) {} + + OptionalHIPGuardMasqueradingAsCUDA(const OptionalHIPGuardMasqueradingAsCUDA&) = delete; + OptionalHIPGuardMasqueradingAsCUDA& operator=(const OptionalHIPGuardMasqueradingAsCUDA&) = delete; + OptionalHIPGuardMasqueradingAsCUDA(OptionalHIPGuardMasqueradingAsCUDA&& other) = delete; + OptionalHIPGuardMasqueradingAsCUDA& operator=(OptionalHIPGuardMasqueradingAsCUDA&& other) = delete; + + void set_device(Device device) { guard_.set_device(device); } + void reset_device(Device device) { guard_.reset_device(device); } + void set_index(DeviceIndex device_index) { guard_.set_index(device_index); } + std::optional original_device() const { return guard_.original_device(); } + std::optional current_device() const { return guard_.current_device(); } + void reset() { guard_.reset(); } + +private: + c10::impl::InlineOptionalDeviceGuard guard_; +}; + +struct HIPStreamGuardMasqueradingAsCUDA { + explicit HIPStreamGuardMasqueradingAsCUDA() = delete; + explicit HIPStreamGuardMasqueradingAsCUDA(Stream stream) : guard_(stream) {} + HIPStreamGuardMasqueradingAsCUDA(const HIPStreamGuardMasqueradingAsCUDA&) = delete; + HIPStreamGuardMasqueradingAsCUDA& operator=(const HIPStreamGuardMasqueradingAsCUDA&) = delete; + HIPStreamGuardMasqueradingAsCUDA(HIPStreamGuardMasqueradingAsCUDA&& other) = delete; + HIPStreamGuardMasqueradingAsCUDA& operator=(HIPStreamGuardMasqueradingAsCUDA&& other) = delete; + + void reset_stream(Stream stream) { guard_.reset_stream(stream); } + + HIPStreamMasqueradingAsCUDA original_stream() const { + return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, guard_.original_stream()); + } + HIPStreamMasqueradingAsCUDA current_stream() const { + return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, guard_.current_stream()); + } + + Device current_device() const { return guard_.current_device(); } + Device original_device() const { return guard_.original_device(); } + +private: + c10::impl::InlineStreamGuard guard_; +}; + +struct OptionalHIPStreamGuardMasqueradingAsCUDA { + explicit OptionalHIPStreamGuardMasqueradingAsCUDA() : guard_() {} + explicit OptionalHIPStreamGuardMasqueradingAsCUDA(Stream stream) : guard_(stream) {} + explicit OptionalHIPStreamGuardMasqueradingAsCUDA(std::optional stream_opt) : guard_(stream_opt) {} + + OptionalHIPStreamGuardMasqueradingAsCUDA(const OptionalHIPStreamGuardMasqueradingAsCUDA&) = delete; + OptionalHIPStreamGuardMasqueradingAsCUDA& operator=(const OptionalHIPStreamGuardMasqueradingAsCUDA&) = delete; + OptionalHIPStreamGuardMasqueradingAsCUDA(OptionalHIPStreamGuardMasqueradingAsCUDA&& other) = delete; + OptionalHIPStreamGuardMasqueradingAsCUDA& operator=(OptionalHIPStreamGuardMasqueradingAsCUDA&& other) = delete; + + void reset_stream(Stream stream) { guard_.reset_stream(stream); } + + std::optional original_stream() const { + auto r = guard_.original_stream(); + if (r.has_value()) { + return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, r.value()); + } else { + return std::nullopt; + } + } + + std::optional current_stream() const { + auto r = guard_.current_stream(); + if (r.has_value()) { + return HIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA::UNCHECKED, r.value()); + } else { + return std::nullopt; + } + } + + void reset() { guard_.reset(); } + +private: + c10::impl::InlineOptionalStreamGuard guard_; +}; + +struct HIPMultiStreamGuardMasqueradingAsCUDA { + explicit HIPMultiStreamGuardMasqueradingAsCUDA(ArrayRef streams) + : guard_(unwrapStreams(streams)) {} + + HIPMultiStreamGuardMasqueradingAsCUDA(const HIPMultiStreamGuardMasqueradingAsCUDA&) = delete; + HIPMultiStreamGuardMasqueradingAsCUDA& operator=(const HIPMultiStreamGuardMasqueradingAsCUDA&) = delete; + HIPMultiStreamGuardMasqueradingAsCUDA(HIPMultiStreamGuardMasqueradingAsCUDA&& other) = delete; + HIPMultiStreamGuardMasqueradingAsCUDA& operator=(HIPMultiStreamGuardMasqueradingAsCUDA&& other) = delete; + +private: + c10::impl::InlineMultiStreamGuard guard_; + + static std::vector unwrapStreams(ArrayRef hipStreams) { + std::vector streams; + streams.reserve(hipStreams.size()); + for (const HIPStreamMasqueradingAsCUDA& hipStream : hipStreams) { + streams.push_back(hipStream); + } + return streams; + } +}; + +}} // namespace c10::hip + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h new file mode 100644 index 0000000000000000000000000000000000000000..48f1459396b82283290e457a16f9ec66ee500601 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h @@ -0,0 +1,140 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +// Use of c10::hip namespace here makes hipification easier, because +// I don't have to also fix namespaces. Sorry! +namespace c10 { namespace hip { + +// See Note [Masquerading as CUDA] for motivation + +class HIPStreamMasqueradingAsCUDA { +public: + + enum Unchecked { UNCHECKED }; + + explicit HIPStreamMasqueradingAsCUDA(Stream stream) + : HIPStreamMasqueradingAsCUDA(UNCHECKED, stream) { + // We did the coercion unchecked; check that it was right. + TORCH_CHECK(stream.device().is_cuda() /* !!! */); + } + + explicit HIPStreamMasqueradingAsCUDA(Unchecked, Stream stream) + // Unsafely coerce the "CUDA" stream into a HIP stream + : stream_( + HIPStream( + Stream( + Stream::UNSAFE, + Device(c10::DeviceType::HIP, stream.device_index()), + stream.id()) + ) + ) {} + + // New constructor, just for this. Does NOT coerce. + explicit HIPStreamMasqueradingAsCUDA(HIPStream stream) : stream_(stream) {} + + bool operator==(const HIPStreamMasqueradingAsCUDA& other) const noexcept { + return stream_ == other.stream_; + } + + bool operator!=(const HIPStreamMasqueradingAsCUDA& other) const noexcept { + return stream_ != other.stream_; + } + + operator hipStream_t() const { return stream_.stream(); } + + operator Stream() const { + // Unsafely coerce HIP stream into a "CUDA" stream + return Stream(Stream::UNSAFE, device(), id()); + } + + DeviceIndex device_index() const { return stream_.device_index(); } + + // Unsafely coerce HIP device into CUDA device + c10::DeviceType device_type() const { return c10::DeviceType::CUDA; } + + Device device() const { + // Unsafely coerce HIP device into CUDA device + return Device(c10::DeviceType::CUDA, stream_.device_index()); + } + + StreamId id() const { return stream_.id(); } + bool query() const { return stream_.query(); } + void synchronize() const { stream_.synchronize(); } + int priority() const { return stream_.priority(); } + hipStream_t stream() const { return stream_.stream(); } + + Stream unwrap() const { + // Unsafely coerce HIP stream into "CUDA" stream + return Stream(Stream::UNSAFE, device(), id()); + } + + c10::StreamData3 pack3() const noexcept { + // Unsafely coerce HIP stream into "CUDA" stream before packing + return unwrap().pack3(); + } + + static HIPStreamMasqueradingAsCUDA unpack3(StreamId stream_id, + DeviceIndex device_index, + c10::DeviceType device_type) { + // NB: constructor manages CUDA->HIP translation for us + return HIPStreamMasqueradingAsCUDA(Stream::unpack3( + stream_id, device_index, device_type)); + } + + static std::tuple priority_range() { return HIPStream::priority_range(); } + + // New method, gets the underlying HIPStream + HIPStream hip_stream() const { return stream_; } + +private: + HIPStream stream_; +}; + +HIPStreamMasqueradingAsCUDA +inline getStreamFromPoolMasqueradingAsCUDA(const bool isHighPriority = false, DeviceIndex device = -1) { + return HIPStreamMasqueradingAsCUDA(getStreamFromPool(isHighPriority, device)); +} + +HIPStreamMasqueradingAsCUDA +inline getStreamFromPoolMasqueradingAsCUDA(const int priority, DeviceIndex device = -1) { + return HIPStreamMasqueradingAsCUDA(getStreamFromPool(priority, device)); +} + +HIPStreamMasqueradingAsCUDA +inline getStreamFromExternalMasqueradingAsCUDA(hipStream_t ext_stream, DeviceIndex device) { + return HIPStreamMasqueradingAsCUDA(getStreamFromExternal(ext_stream, device)); +} + +inline HIPStreamMasqueradingAsCUDA getDefaultHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) { + return HIPStreamMasqueradingAsCUDA(getDefaultHIPStream(device_index)); +} + +inline HIPStreamMasqueradingAsCUDA getCurrentHIPStreamMasqueradingAsCUDA(DeviceIndex device_index = -1) { + return HIPStreamMasqueradingAsCUDA(getCurrentHIPStream(device_index)); +} + +inline void setCurrentHIPStreamMasqueradingAsCUDA(HIPStreamMasqueradingAsCUDA stream) { + setCurrentHIPStream(stream.hip_stream()); +} + +inline std::ostream& operator<<(std::ostream& stream, const HIPStreamMasqueradingAsCUDA& s) { + stream << s.hip_stream() << " (masquerading as CUDA)"; + return stream; +} + +}} // namespace c10::hip + +namespace std { + template <> + struct hash { + size_t operator()(c10::hip::HIPStreamMasqueradingAsCUDA s) const noexcept { + return std::hash{}(s.unwrap()); + } + }; +} // namespace std + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Descriptors.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Descriptors.h new file mode 100644 index 0000000000000000000000000000000000000000..e0f972da4ea1a69db273756e32b04b9c09309729 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Descriptors.h @@ -0,0 +1,210 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include +#include + +namespace at { namespace native { + +std::string miopenTypeToString(miopenDataType_t dtype); + +inline int dataSize(miopenDataType_t dataType) +{ + switch (dataType) { + case miopenHalf: return 2; + case miopenFloat: return 4; + case miopenBFloat16: return 2; + default: return 8; + } +} + +// See NOTE [ cudnn fixSizeOneDimStride ] in aten/src/ATen/cudnn/Descriptors.h +template +static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) { + int64_t z = 1; + int index = 0; + std::vector permutation(dim); + + if (nhwc) { + permutation[index++] = 1; + } + for (int d = dim-1; d > 1; d--) { + permutation[index++] = d; + } + if (!nhwc) { + permutation[index++] = 1; + } + permutation[index++] = 0; + for (int d : permutation) { + if (size[d] == 1) { + stride[d] = z; + } else { + z *= size[d]; + } + } +} + +template +struct DescriptorDeleter { + void operator()(T* x) { + if (x != nullptr) { + MIOPEN_CHECK(dtor(x)); + } + } +}; + +// A generic class for wrapping MIOpen descriptor types. All you need +// is to give the underlying type the Descriptor_t points to (usually, +// if it's miopenTensorDescriptor_t it points to miopenTensorStruct), +// the constructor and the destructor. Subclasses are responsible +// for defining a set() function to actually set the descriptor. +// +// Descriptors default construct to a nullptr, and have a descriptor +// initialized the first time you call set() or any other initializing +// function. +template +// NOLINTNEXTLINE(bugprone-exception-escape) +class TORCH_HIP_CPP_API Descriptor { + public: + // Use desc() to access the underlying descriptor pointer in + // a read-only fashion. Most client code should use this. + // If the descriptor was never initialized, this will return + // nullptr. + T* desc() const { return desc_.get(); } + T* desc() { return desc_.get(); } + + // Use mut_desc() to access the underlying descriptor pointer + // if you intend to modify what it points to (e.g., using + // miopenSetFooDescriptor). This will ensure that the descriptor + // is initialized. Code in this file will use this function. + T* mut_desc() { init(); return desc_.get(); } +protected: + void init() { + if (desc_ == nullptr) { + T* raw_desc = nullptr; + MIOPEN_CHECK(ctor(&raw_desc)); + desc_.reset(raw_desc); + } + } +private: + std::unique_ptr> desc_; +}; + +class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor< + miopenTensorDescriptor, + &miopenCreateTensorDescriptor, + &miopenDestroyTensorDescriptor> { + public: + TensorDescriptor() = default; + explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) { + set(t, pad); + } + + // See Note [CuDNN broadcast padding] + void set(const at::Tensor &t, size_t pad = 0); + void set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad = 0); + void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0); + + void print(); + +private: + void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc); + + void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) { + std::vector strides_copy(stride, stride + dim); + fixSizeOneDimStride(dim, size, strides_copy.data(), nhwc); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data())); + } +}; + +std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d); + +class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor< + miopenTensorDescriptor, + &miopenCreateTensorDescriptor, + &miopenDestroyTensorDescriptor> { + public: + void set(const at::Tensor &t, int64_t pad = 0) { + set(t, at::MemoryFormat::Contiguous, pad); + } + + void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0); + +private: + void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) { + std::vector strides_copy(stride, stride + dim); + fixSizeOneDimStride(dim, size, strides_copy.data(), nhwc); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data())); + } +}; + +struct TORCH_HIP_CPP_API ConvolutionDescriptor + : public Descriptor< + miopenConvolutionDescriptor, + &miopenCreateConvolutionDescriptor, + &miopenDestroyConvolutionDescriptor> { + void set(miopenDataType_t dataType, miopenConvolutionMode_t c_mode, int dim, int* pad, int* stride, int * upscale /* aka dilation */, int groups, bool benchmark, bool deterministic) { + MIOPEN_CHECK(miopenInitConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale, c_mode)); + MIOPEN_CHECK(miopenSetConvolutionGroupCount(mut_desc(), groups)); + MIOPEN_CHECK(miopenSetConvolutionAttribute(mut_desc(), MIOPEN_CONVOLUTION_ATTRIB_DETERMINISTIC, deterministic ? 1 : 0)); + if (benchmark) { + MIOPEN_CHECK(miopenSetConvolutionFindMode(mut_desc(), miopenConvolutionFindModeNormal)); + } + } +}; + +// NOLINTNEXTLINE(bugprone-exception-escape) +struct TORCH_HIP_CPP_API DropoutDescriptor + : public Descriptor< + miopenDropoutDescriptor, + &miopenCreateDropoutDescriptor, + &miopenDestroyDropoutDescriptor> { + void set(miopenHandle_t handle, float dropout, void* states, size_t stateSizeInBytes, + unsigned long long seed, bool use_mask, bool state_evo, miopenRNGType_t rng_mode) { + MIOPEN_CHECK(miopenSetDropoutDescriptor(mut_desc(), handle, dropout, states, stateSizeInBytes, seed, use_mask, state_evo, rng_mode)); + } + + void restore(miopenHandle_t handle, float dropout, void* states, size_t stateSizeInBytes, + unsigned long long seed, bool use_mask, bool state_evo, miopenRNGType_t rng_mode) { + MIOPEN_CHECK(miopenRestoreDropoutDescriptor(mut_desc(), handle, dropout, states, stateSizeInBytes, seed, use_mask, state_evo, rng_mode)); + } +}; + +struct TORCH_HIP_CPP_API RNNDescriptor + : public Descriptor +{ + void set(int64_t hidden_size, int64_t num_layers, miopenRNNInputMode_t input_mode, miopenRNNDirectionMode_t direction, miopenRNNMode_t rnn_mode, + miopenRNNBiasMode_t bias_mode, miopenRNNAlgo_t algorithm, miopenDataType_t datatype) { + MIOPEN_CHECK(miopenSetRNNDescriptor(mut_desc(), hidden_size, num_layers, input_mode, direction, rnn_mode, bias_mode, algorithm, datatype)); + } + + void setWithDropout(DropoutDescriptor& dropout_desc, int64_t hidden_size, int64_t num_layers, miopenRNNInputMode_t input_mode, miopenRNNDirectionMode_t direction, + miopenRNNMode_t rnn_mode, miopenRNNBiasMode_t bias_mode, miopenRNNAlgo_t algorithm, miopenDataType_t datatype) { + MIOPEN_CHECK(miopenSetRNNDescriptor_V2(mut_desc(), hidden_size, num_layers, dropout_desc.mut_desc(), input_mode, direction, rnn_mode, bias_mode, algorithm, datatype)); + } +}; + +union Constant +{ + float f; + double d; + Constant(miopenDataType_t dataType, double value) { + if (dataType == miopenHalf || dataType == miopenFloat || dataType == miopenBFloat16) { + f = static_cast(value); + } else { + d = value; + } + } +}; + +}} // namespace + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Exceptions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Exceptions.h new file mode 100644 index 0000000000000000000000000000000000000000..c7bc662c92c808373571254e524fde9a5c7aaadc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Exceptions.h @@ -0,0 +1,46 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at { namespace native { + +class miopen_exception : public std::runtime_error { +public: + miopenStatus_t status; + miopen_exception(miopenStatus_t status, const char* msg) + : std::runtime_error(msg) + , status(status) {} + miopen_exception(miopenStatus_t status, const std::string& msg) + : std::runtime_error(msg) + , status(status) {} +}; + +inline void MIOPEN_CHECK(miopenStatus_t status) +{ + if (status != miopenStatusSuccess) { + if (status == miopenStatusNotImplemented) { + throw miopen_exception(status, std::string(miopenGetErrorString(status)) + + ". This error may appear if you passed in a non-contiguous input."); + } + throw miopen_exception(status, miopenGetErrorString(status)); + } +} + +inline void HIP_CHECK(hipError_t error) +{ + if (error != hipSuccess) { + std::string msg("HIP error: "); + msg += hipGetErrorString(error); + throw std::runtime_error(msg); + } +} + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Handle.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Handle.h new file mode 100644 index 0000000000000000000000000000000000000000..f5a3577f06a7a5f8416dadc1188f4882de306ae9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Handle.h @@ -0,0 +1,14 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +TORCH_HIP_CPP_API miopenHandle_t getMiopenHandle(); +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Types.h new file mode 100644 index 0000000000000000000000000000000000000000..98423302b3479e383e50d05f60c37041cc139b89 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Types.h @@ -0,0 +1,18 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native { + +TORCH_HIP_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor); + +int64_t miopen_version(); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..790aaf5b11c0df8c1f17ccb3ca51930f248cdcea --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/Utils.h @@ -0,0 +1,23 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at { namespace native { + +// This function makes tensors which have zero stride contiguous, by +// setting the strides to 1. +inline Tensor contiguousIfZeroInStrides(const Tensor& t) { + for (auto s : t.strides()) { + if (s == 0) return t.contiguous(); + } + return t; +} + +}} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/miopen-wrapper.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/miopen-wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..8b76fb0e6fe92b2033c6d8ba8caf006d29f767b1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/miopen/miopen-wrapper.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#if MIOPEN_VERSION_MAJOR > 3 || (MIOPEN_VERSION_MAJOR == 3 && MIOPEN_VERSION_MINOR >= 4) +// miopen 3.4 moved find mode from private header to public header +#else +// from miopen_internal.h +extern "C" { + +typedef enum +{ + miopenConvolutionFindModeNormal = 1, /*!< Normal mode */ +} miopenConvolutionFindMode_t; + +miopenStatus_t miopenSetConvolutionFindMode( + miopenConvolutionDescriptor_t convDesc, + miopenConvolutionFindMode_t findMode); +} +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/EmptyTensor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/EmptyTensor.h new file mode 100644 index 0000000000000000000000000000000000000000..3507c0e17afd44189a5a69e3bc216b10a61dd626 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/EmptyTensor.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once +#include + +namespace at::detail { + +C10_EXPORT TensorBase empty_mps( + IntArrayRef size, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); +C10_EXPORT TensorBase empty_mps(IntArrayRef size, const TensorOptions& options); + +C10_EXPORT TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + ScalarType dtype, + std::optional device_opt); + +C10_EXPORT TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + const TensorOptions& options); + +} // namespace at::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/IndexKernels.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/IndexKernels.h new file mode 100644 index 0000000000000000000000000000000000000000..be3ad0b5c05a9d98cf1ef253e0fe69b7816d4af1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/IndexKernels.h @@ -0,0 +1,225 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at::mps { + +static const char* SCATTER_OPS_TEMPLATE = R"METAL_SCATTER( +template +Y cast(const X x); + +template<> +{1} cast<{1}, {0}>(const {0} x) {{ + return {2}; +}} + +kernel void scatter_kernel_n(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant uint32_t * size [[buffer(2)]], + constant uint32_t * stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]], + constant int32_t & ndim [[buffer(5)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + uint64_t dst_offs = 0; + auto dst_idx = linear_index; + for(int dim = ndim - 1; dim >= 0; --dim) {{ + dst_offs += stride[dim] * (dst_idx % size[dim]); + dst_idx /= size[dim]; + }} + + dst[dst_offs] = cast<{1}>(src[linear_index]); +}} + +kernel void scatter_kernel_4(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint4 & size [[buffer(2)]], + constant packed_uint4 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint4 local_index; + local_index.x = linear_index / (size[3] * size[2] * size[1]) % size[0]; + local_index.y = linear_index / (size[3] * size[2]) % size[1]; + local_index.z = linear_index / size[3] % size[2]; + local_index.w = linear_index % size[3]; + + const packed_uint4 strided_index = local_index * stride; + dst[strided_index.x + strided_index.y + strided_index.z + strided_index.w] = cast<{1}>(src[linear_index]); +}} + +kernel void scatter_kernel_3(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint3 & size [[buffer(2)]], + constant packed_uint3 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint3 local_index; + local_index.x = linear_index / (size[2] * size[1]) % size[0]; + local_index.y = linear_index / size[2] % size[1]; + local_index.z = linear_index % size[2]; + + const packed_uint3 strided_index = local_index * stride; + dst[strided_index.x + strided_index.y + strided_index.z] = cast<{1}>(src[linear_index]); +}} + +kernel void scatter_kernel_2(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint2 & size [[buffer(2)]], + constant packed_uint2 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint2 local_index; + local_index.x = linear_index / size[1] % size[0]; + local_index.y = linear_index % size[1]; + + const packed_uint2 strided_index = local_index * stride; + dst[strided_index.x + strided_index.y] = cast<{1}>(src[linear_index]); +}} + +kernel void scatter_kernel_1(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant int & size [[buffer(2)]], + constant int & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + const int local_index = linear_index % size; + const int strided_index = local_index * stride; + dst[strided_index] = cast<{1}>(src[linear_index]); +}} +)METAL_SCATTER"; + +static const char* GATHER_OPS_TEMPLATE = R"METAL_GATHER( +template +Y cast(const X x); + +template<> +{1} cast<{1}, {0}>(const {0} x) {{ + return {2}; +}} + +kernel void gather_kernel_n(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant uint32_t * size [[buffer(2)]], + constant uint32_t * stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]], + constant int32_t & ndim [[buffer(5)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + uint64_t src_offs = 0; + auto src_idx = linear_index; + for(int dim = ndim - 1; dim >= 0; --dim) {{ + src_offs += stride[dim] * (src_idx % size[dim]); + src_idx /= size[dim]; + }} + + dst[linear_index] = cast<{1}>(src[src_offs]); +}} + +kernel void gather_kernel_4(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint4 & size [[buffer(2)]], + constant packed_uint4 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint4 local_index; + local_index.x = linear_index / (size[3] * size[2] * size[1]) % size[0]; + local_index.y = linear_index / (size[3] * size[2]) % size[1]; + local_index.z = linear_index / size[3] % size[2]; + local_index.w = linear_index % size[3]; + + const packed_uint4 strided_index = local_index * stride; + dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z + strided_index.w]); +}} + +kernel void gather_kernel_3(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint3 & size [[buffer(2)]], + constant packed_uint3 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint3 local_index; + local_index.x = linear_index / (size[2] * size[1]) % size[0]; + local_index.y = linear_index / size[2] % size[1]; + local_index.z = linear_index % size[2]; + + const packed_uint3 strided_index = local_index * stride; + dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z]); +}} + +kernel void gather_kernel_2(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint2 & size [[buffer(2)]], + constant packed_uint2 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint2 local_index; + local_index.x = linear_index / size[1] % size[0]; + local_index.y = linear_index % size[1]; + + const packed_uint2 strided_index = local_index * stride; + dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y]); +}} + +kernel void gather_kernel_1(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant int & size [[buffer(2)]], + constant int & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + const int local_index = linear_index % size; + const int strided_index = local_index * stride; + dst[linear_index] = cast<{1}>(src[strided_index]); +}} +)METAL_GATHER"; +} // namespace at::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSAllocator.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..f295d43e1e5acec8c55d03ba4e3a62478567e63d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSAllocator.h @@ -0,0 +1,442 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// this implementation is based on CUDACachingAllocator. +// It utilizes Metal Heaps to improve the performance with buffer allocation. +// Do not include this header. Use MPSAllocatorInterface.h instead. +// TODO: Unify the logic with CUDACachingAllocator and remove redundant code. +namespace at::mps::HeapAllocator { + +static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB +static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap +static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB +static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps +static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps +static const size_t kXLargeHeapD = + MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps +static const size_t kXLargeHeapU = + MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps +static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation + +// buffer pools could be customized with a combination of usage flags +enum UsageFlags : uint32_t { + PRIVATE = 0, + SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap + SHARED = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device + MANAGED = (1 << 2), // managed storage mode + HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool + SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream +}; +// debug verbosity flags +enum DebugVerbosity : uint32_t { + SILENT = 0, + PROFILING = (1 << 0), // print generic profiling data for total system memory usage + ALLOCATIONS = (1 << 1), // print buffer allocations + RECYCLES = (1 << 2), // print buffer recycling + RELEASES = (1 << 3), // print buffer releases + LARGE_ONLY = (1 << 4), // only log large buffer pool transactions +}; + +struct HeapBlock; + +struct BufferBlock { + id buffer; + void* cpu_ptr = nullptr; // stores the pointer to CPU mapping of a Shared MTLBuffer + size_t size; // size after alignment + size_t requested_size; // requested size (before alignment) + // buffer shape is used for retrieving base of views in cached graphs + std::vector shape; + bool in_use = false; + HeapBlock* heap; + id_t buf_id; + // counter to candidate least recently used buffers for garbage collection + uint32_t gc_count = 0; + uint32_t use_count = 0; + // counter to assign unique ids to buffer blocks + static uint64_t buffer_counter; + // Metal events used to sync GPU/CPU operations on the shared-storage buffers + MPSEventPtr event; + + BufferBlock(size_t Size, size_t RequestedSize = 0, const id Buffer = nullptr, HeapBlock* Heap = nullptr) + : buffer(Buffer), size(Size), requested_size(RequestedSize), heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) {} + + static bool Comparator(const BufferBlock* a, const BufferBlock* b) { + return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer; + } + static size_t alignUp(size_t Size, size_t Alignment) { + assert(((Alignment - 1) & Alignment) == 0); + return ((Size + Alignment - 1) & ~(Alignment - 1)); + } + uint32_t retainCount() const { + return [buffer retainCount]; + } +}; +typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*); + +struct BufferPool; +struct AllocParams { + AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) + : search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) {} + size_t size() const { + return search_key.size; + } + + BufferBlock search_key; + BufferPool* pool; + BufferBlock* buffer_block = nullptr; + size_t requested_size; + // true if we exceed the low watermark limit. In this case + // we apply strategies to relieve the pressure before allocation. + bool has_memory_pressure = false; + // true if we're allocating on a unified memory device + bool has_unified_memory = true; +}; + +struct HeapBlock { + id heap; + struct { + size_t total, available; + } size; + BufferPool* pool; + unsigned int n_buffers = 0; + id_t heap_id; + // indicates if we split this heap to sub-allocate 'several' buffers (otherwise single buffer) + bool is_split; + // counter to assign unique ids to heap blocks + static uint64_t heap_counter; + + HeapBlock(size_t Size, const id Heap = nullptr, BufferPool* Pool = nullptr) + : heap(Heap), + size({.total = Size, .available = Size}), + pool(Pool), + heap_id(Heap ? ++heap_counter : 0), + is_split(true) {} + + static MTLResourceOptions getOptions(uint32_t usage) { + // TODO: check the caching performance of write-combined mode + MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache; + + if (usage & UsageFlags::MANAGED) + options |= MTLResourceStorageModeManaged; + else if (usage & UsageFlags::SHARED) + options |= MTLResourceStorageModeShared; + else + options |= MTLResourceStorageModePrivate; + + options |= + (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked; + + return options; + } + + static HeapBlock* createHeapBlock(AllocParams& params, id device, uint32_t usage) { + HeapBlock* heapBlock = nullptr; + bool is_split = true; + const size_t size = params.size(); + MTLHeapDescriptor* d = [MTLHeapDescriptor new]; + if (d) { + const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD; + if (size <= kMaxSmallAlloc) { + d.size = kSmallHeap; + } else if (size < kMinLargeAlloc) { + d.size = kLargeHeap; + } else if (size < kXLargeHeap / 2 && !params.has_memory_pressure) { + d.size = kXLargeHeap; + } else { + d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + is_split = false; + } + d.storageMode = (usage & UsageFlags::SHARED) ? MTLStorageModeShared : MTLStorageModePrivate; + d.cpuCacheMode = MTLCPUCacheModeDefaultCache; + // this automatically handles Metal buffer access synchronizations at the + // cost of slightly lower performance. + d.hazardTrackingMode = + (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked; + d.resourceOptions = getOptions(usage); + d.type = MTLHeapTypeAutomatic; + id heap = [device newHeapWithDescriptor:d]; + if (heap) { + [heap setPurgeableState:MTLPurgeableStateNonVolatile]; + const size_t heap_size = heapAvailableSize(heap); + heapBlock = new HeapBlock(heap_size, heap, params.pool); + if (heapBlock) { + heapBlock->is_split = is_split; + } + } + [d release]; + } + return heapBlock; + } + static bool Comparator(const HeapBlock* a, const HeapBlock* b) { + return (a->size.available != b->size.available) ? a->size.available < b->size.available + : (uintptr_t)a->heap < (uintptr_t)b->heap; + } + static NSUInteger heapAvailableSize(id heap, size_t Alignment = vm_page_size) { + return [heap maxAvailableSizeWithAlignment:Alignment]; + } + NSUInteger Size() { + return [heap size]; + } + id newMTLBuffer(size_t length, uint32_t usage) { + id buf = [heap newBufferWithLength:length options:getOptions(usage)]; + if (buf) { + updateAvailableSize(); + n_buffers++; + } + return buf; + } + // returns the retainCount before releasing the buffer + uint32_t releaseMTLBuffer(id& buffer) { + const uint32_t retainCount = [buffer retainCount]; + [buffer release]; + buffer = nil; + updateAvailableSize(); + n_buffers--; + return retainCount; + } + // returns the retainCount before releasing the heap + uint32_t releaseMTLHeap() { + const uint32_t retainCount = [heap retainCount]; + TORCH_INTERNAL_ASSERT(!n_buffers); // assert if heap isn't empty + [heap setPurgeableState:MTLPurgeableStateEmpty]; + [heap release]; + heap = nil; + size.available = 0; + return retainCount; + } + uint32_t retainCount() const { + return [heap retainCount]; + } + void updateAvailableSize() { + size.available = heapAvailableSize(heap); + } +}; +typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*); + +struct BufferPool { + enum class Kind { + PRIVATE_SMALL, + PRIVATE_LARGE, + SHARED_SMALL, + SHARED_LARGE, + SCALAR, + }; + + BufferPool(const id Device, uint32_t Usage) + : device(Device), usage(Usage), heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) {} + + const id device; + // usage flags to customize the pool for various purposes (see UsageFlags enum) + const uint32_t usage; + // total number of buffers in the pool + uint32_t n_buffers = 0; + // total allocations size on this pool + size_t allocated_size = 0; + // total memory available in the pool + size_t available_size = 0; + // list of heaps ordered by their "available" (not total) memory size + std::set heaps; + // list of only "available" buffers in the pool (i.e., buffers not in-use) + std::set available_buffers; + // list of buffers that are in a state of "limbo" where they've already been freed + // from PyTorch-side, but were not returned to pool due to still being + // in-use by command buffers with retainCount > 1. In this state, the buffer is + // neither ready to be recycled, nor could be returned to pool as available. + // These buffers will be returned to pool once the command buffer's + // completionHandler callbacks are called. + std::unordered_set buffers_pending_free; + // list of heaps pending size update + std::unordered_set heaps_pending_update; +}; + +class MPSHeapAllocatorImpl { + public: + explicit MPSHeapAllocatorImpl() + : m_device(at::mps::MPSDevice::getInstance()->device()), + m_max_buffer_size([m_device maxBufferLength]), + m_stream(getDefaultMPSStream()), + m_event_pool(getMPSEventPool()) { + init_allocator(); + } + ~MPSHeapAllocatorImpl() { + emptyCache(); + } + // interface exposed to at::Allocator + id malloc(size_t size, uint32_t usage); + // frees a buffer and returns it into buffer pool + void free(void* ptr); + // releases all the cached buffers and their associated heaps + void emptyCache(); + // free inactive buffers that are pending to be freed + void freeInactiveBuffers(); + // returns true if buffer was allocated from the shared pool + bool isSharedBuffer(const void* ptr); + // get the requested unaligned size of an MTLBuffer + ssize_t getUnalignedBufferSize(const void* ptr); + // set the shape of a base tensor from a view tensor + void setBufferShape(const void* ptr, const IntArrayRef& shape); + // retrieve the shape of a base tensor from a view tensor + IntArrayRef getBufferShape(const void* ptr); + // get the unique ID of the buffer + id_t getBufferId(const void* ptr); + // allocate a buffer from a specialized pool to import CPU scalars into GPU + id allocScalarBufferWithValue(void* value, size_t size); + // returns a CPU-mapping of the input buffer and its retainCount, + // if only it has Shared storage-mode and allocated on MPSAllocator + std::pair getSharedBufferPtr(const void* buffer); + // records events for a list of MTLBuffers (list is used to lock the mutex once) + // returns true if records any event (given if passed buffers exist and are shared-storage) + bool recordEvents(c10::ArrayRef buffers); + // waits for the event to signal the completion of GPU execution + // on the passed shared buffers (list is used to lock the mutex once) + // returns true if actually waited on any event + bool waitForEvents(c10::ArrayRef buffers); + // this indicates how far (in Megabytes) the current total allocations are from the + // low watermark limit which is used to detect if we're under memory pressure + // This returns zero if we've reached the low watermark limit + ssize_t getLowWatermarkValue(); + // (see m_low_watermark_ratio for description) + void setLowWatermarkRatio(double ratio); + // (see m_high_watermark_ratio for description) + void setHighWatermarkRatio(double ratio); + // (see m_low_watermark_limit for description) + size_t getLowWatermarkLimit() const { + return m_low_watermark_limit; + } + // (see m_max_total_allowed_size for description) + size_t getHighWatermarkLimit() const { + return m_max_total_allowed_size; + } + // (see m_total_allocated_memory for description) + size_t getTotalAllocatedMemory() const { + return m_total_allocated_memory; + } + // (see m_current_allocated_memory for description) + size_t getCurrentAllocatedMemory() const { + return m_current_allocated_memory; + } + // total GPU memory allocated in the process by Metal driver; including + // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl. + size_t getDriverAllocatedMemory() const { + return current_allocated_size(); + } + // recommended Max memory for Metal + size_t getRecommendedMaxMemory() const { + return max_device_size(); + } + // (see enum DebugVerbosity for description) + uint32_t getDebugVerbosity() const { + return m_debug_verbosity; + } + // returns the device that we allocate from + inline id Device() const { + return m_device; + } + + inline std::string format_size(uint64_t size) const; + + private: + // (see m_high_watermark_ratio for description) + constexpr static double default_high_watermark_ratio = 1.7; + // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize. + constexpr static double default_high_watermark_upper_bound = 2.0; + // (see m_low_watermark_ratio for description) + // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize + constexpr static double default_low_watermark_ratio_unified = 1.4; + constexpr static double default_low_watermark_ratio_discrete = 1.0; + + const id m_device; + std::recursive_mutex m_mutex; + // allocated buffers by device pointer + ska::flat_hash_map m_allocated_buffers; + // using a container for pools to simplify iterating them + ska::flat_hash_map> m_pools; + // total memory allocated by HeapAllocator (including blocks in pools) + size_t m_total_allocated_memory = 0; + // currently active memory allocations in use (i.e., blocks not in pools) + size_t m_current_allocated_memory = 0; + // max buffer size allowed by Metal + size_t m_max_buffer_size = 0; + // maximum total size allowed to be allocated + size_t m_max_total_allowed_size = 0; + // high watermark ratio is a hard limit for the total allowed allocations + // 0. : disables high watermark limit (may cause system failure if system-wide OOM occurs) + // 1. : recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize) + // >1.: allows limits beyond the device.recommendedMaxWorkingSetSize + // e.g., value 0.95 means we allocate up to 95% of recommended maximum + // allocation size; beyond that, the allocations would fail with OOM error. + double m_high_watermark_ratio; + // low watermark ratio is a soft limit to attempt limiting memory allocations up to the lower watermark + // level by garbage collection or committing command buffers more frequently (a.k.a, adaptive commit). + // Value between 0 to m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection) + // e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of recommended maximum + // allocation size. + double m_low_watermark_ratio; + // low watermark size limit (in Bytes) at the time we initialize the allocator + size_t m_low_watermark_limit; + // use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity + uint32_t m_debug_verbosity; + // default MPS stream + MPSStream* m_stream; + // we hold a reference to MPSEventPool so it could get destroyed after MPSAllocator + std::shared_ptr m_event_pool; + + void init_allocator(); + void init_buffer_pools(); + HeapBlock* get_free_heap(AllocParams& params); + bool get_free_buffer(AllocParams& params); + BufferBlock* get_allocated_buffer_block(const void* ptr); + BufferBlock* alloc_buffer_block(size_t size, uint32_t usage); + bool alloc_buffer(AllocParams& params); + void free_buffer(BufferBlock* buffer_block); + // returns true if the container heap is also released + bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true); + void release_buffers(BufferPool& pool); + bool release_available_cached_buffers(AllocParams& params); + bool release_cached_buffers(); + // free unused cached blocks to reclaim GPU memory if memory pressure is high + void garbage_collect_cached_buffers(AllocParams& params); + // returns the suitable buffer pool type for the usage or + // requested/allocated sizes + BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage); + // returns the aligned allocation size that is optimized + // for the buffers to get reused frequently + size_t get_allocation_size(size_t size, uint32_t usage) const; + // maximum size of device memory available for allocation in current process + // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory. + size_t max_device_size() const { + return [m_device recommendedMaxWorkingSetSize]; + } + // there are implicit allocations from MPS backend, so we need to query the 'device' for + // total allocated size instead of manually tracking in MPSAllocator + size_t current_allocated_size() const { + return [m_device currentAllocatedSize]; + } + + bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const { + for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) { + MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback( + buffer_block ? buffer_block->buffer : nullptr, event); + } + return true; + } +}; + +} // namespace at::mps::HeapAllocator + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..cf8de460db3c678225e92d6733a6975a4df7a11c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h @@ -0,0 +1,73 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2023 Apple Inc. + +#pragma once + +#include +#include +#include + +#define MB(x) (x * 1048576UL) + +namespace at::mps { + +// this is a public interface to access MPSAllocator. +// Do not declare methods that would depend on MPS or Metal frameworks. +class IMPSAllocator : public c10::Allocator { + public: + // see the comments in MPSAllocator.h for the description of these methods. + virtual void emptyCache() const = 0; + virtual void freeInactiveBuffers() const = 0; + virtual ssize_t getUnalignedBufferSize(const void* ptr) const = 0; + virtual IntArrayRef getBufferShape(const void* ptr) const = 0; + virtual id_t getBufferId(const void* ptr) const = 0; + virtual void setBufferShape(const void* ptr, const IntArrayRef& shape) + const = 0; + virtual bool isSharedBuffer(const void* ptr) const = 0; + virtual bool isSharedStorageSupported() const = 0; + virtual c10::DataPtr allocScalarBufferWithValue(void* value, size_t size) + const = 0; + virtual std::string formatSize(size_t size) const = 0; + virtual void setLowWatermarkRatio(double ratio) const = 0; + virtual void setHighWatermarkRatio(double ratio) const = 0; + virtual ssize_t getLowWatermarkValue() const = 0; + virtual size_t getLowWatermarkLimit() const = 0; + virtual size_t getHighWatermarkLimit() const = 0; + virtual size_t getTotalAllocatedMemory() const = 0; + virtual size_t getCurrentAllocatedMemory() const = 0; + virtual size_t getDriverAllocatedMemory() const = 0; + virtual size_t getRecommendedMaxMemory() const = 0; + virtual std::pair getSharedBufferPtr( + const void* ptr) const = 0; + virtual bool recordEvents(c10::ArrayRef buffers) const = 0; + virtual bool waitForEvents(c10::ArrayRef buffers) const = 0; +}; + +class IMpsAllocatorCallback { + public: + enum class EventType { + ALLOCATED, // buffer got allocated to be used immediately + RECYCLED, // buffer pulled from free list to be reused + FREED, // buffer put to free list for future recycling + RELEASED, // buffer memory released + ALLOCATION_FAILED // buffer allocation failed + }; + virtual ~IMpsAllocatorCallback() = default; + virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0; +}; + +// MPS allocator will execute every registered callback when a block of memory +// is freed. +TORCH_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback); +#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) \ + C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__) + +IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false); + +bool isMPSPinnedPtr(const void* data); + +} // namespace at::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSDevice.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSDevice.h new file mode 100644 index 0000000000000000000000000000000000000000..a6edee0a5332ab72e04e334af2c341009fd16817 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSDevice.h @@ -0,0 +1,90 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +typedef id MTLDevice_t; +#else +typedef void* MTLDevice_t; +#endif + +namespace at::mps { + +// Helper enum to check if a MPSGraph op is supported in a given macOS version +enum class MacOSVersion : uint32_t { + MACOS_VER_14_4_PLUS = 0, + MACOS_VER_15_0_PLUS, + MACOS_VER_15_1_PLUS, + MACOS_VER_15_2_PLUS, +}; + +//----------------------------------------------------------------- +// MPSDevice +// +// MPSDevice is a singleton class that returns the default device +//----------------------------------------------------------------- + +class TORCH_API MPSDevice { + public: + /** + * MPSDevice should not be cloneable. + */ + MPSDevice(MPSDevice& other) = delete; + /** + * MPSDevice should not be assignable. + */ + void operator=(const MPSDevice&) = delete; + /** + * Gets single instance of the Device. + */ + static MPSDevice* getInstance(); + /** + * Returns the single device. + */ + MTLDevice_t device() { + return _mtl_device; + } + /** + * Returns whether running on Ventura or newer + */ + bool isMacOS13Plus(MacOSVersion version) const; + + /** + * Returns device name + */ + std::string getName() const; + + /** + * Returns number of GPU cores. + * 1 Core = 16 ExecutionUnit x 8 ALU x 24 threads + */ + unsigned getCoreCount() const; + + ~MPSDevice(); + + private: + static MPSDevice* _device; + MTLDevice_t _mtl_device; + MPSDevice(); +}; + +TORCH_API bool is_available(); +TORCH_API bool is_macos_13_or_newer(MacOSVersion version); +TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); + +inline Device getDeviceFromPtr(void* ptr) { + return {c10::DeviceType::MPS, 0}; +} + +} // namespace at::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSEvent.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSEvent.h new file mode 100644 index 0000000000000000000000000000000000000000..aee5c72b4f2b6aaca9544e553e38b460144ea6ba --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSEvent.h @@ -0,0 +1,110 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2023 Apple Inc. + +#pragma once + +#include +#include +#include + +namespace at::mps { + +// NOTE: don't create instances of this class directly. +// Use MPSEventPool to acquire instances of MPSEvent. +class MPSEvent { + public: + explicit MPSEvent(id_t ID, MPSStream* stream, bool enable_timing); + ~MPSEvent(); + + // records an event on the stream + void record(bool needsLock, bool syncEvent = false); + // makes all future work submitted to the stream wait for this event. + bool wait(bool needsLock, bool syncEvent = false); + // schedules a notifyListener callback for the event. + bool notify(bool needsLock, MTLSharedEventNotificationBlock block); + // checks if events are already signaled. + bool query() const; + // blocks the CPU thread until all the GPU work that were scheduled + // prior to recording this event are completed. + bool synchronize(); + // resets this event with new parameters in case it gets reused from the event + // pool + void reset(MPSStream* stream, bool enable_timing); + // returns the unique ID of the event instance + id_t getID() const { + return m_id; + } + // returns the completion timestamp of the event + uint64_t getCompletionTime() const { + return m_completion_time; + } + // if already recorded, waits for cpu_sync_cv to be signaled + void waitForCpuSync(); + + private: + id_t m_id; + // enables measuring the completion time of the notifyListener of this event + bool m_enable_timing; + uint64_t m_signalCounter = 0; + MPSStream* m_stream = nullptr; + MTLSharedEvent_t m_event = nullptr; + MTLSharedEventListener* m_listener = nullptr; + // used to sync the events created on this Stream with CPU + std::mutex m_cpu_sync_mutex{}; + std::condition_variable m_cpu_sync_cv{}; + // CondVar predicate to sync the events created on this Stream with CPU + bool m_cpu_sync_completed = false; + // used to compute elapsed time + uint64_t m_completion_time = 0; + + void recordLocked(bool syncEvent); + bool waitLocked(bool syncEvent); + bool notifyLocked(MTLSharedEventNotificationBlock block); + void notifyCpuSync(); + static uint64_t getTime() { + return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); + } +}; + +typedef std::unique_ptr> MPSEventPtr; + +class MPSEventPool { + public: + explicit MPSEventPool(MPSStream* default_stream); + ~MPSEventPool(); + + MPSEventPtr acquireEvent(bool enable_timing, MPSStream* stream); + void emptyCache(); + + // these are mainly used for MPSHooks and torch.mps.Event() bindings + id_t acquireEvent(bool enable_timing); + void releaseEvent(id_t event_id); + void recordEvent(id_t event_id, bool syncEvent); + void waitForEvent(id_t event_id, bool syncEvent); + void synchronizeEvent(id_t event_id); + bool queryEvent(id_t event_id); + // returns elapsed time between two recorded events in milliseconds + double elapsedTime(id_t start_event_id, id_t end_event_id); + + private: + MPSStream* m_default_stream = nullptr; + std::recursive_mutex m_mutex; + std::stack> m_pool{}; + // dictionary to associate event IDs with event objects + // used to retain in-use events out of the pool + // for torch.mps.Event() bindings. + std::unordered_map m_in_use_events{}; + uint64_t m_event_counter = 0; + std::function m_default_deleter; + + MPSEvent* getInUseEvent(id_t event_id, bool locked = true); +}; + +// shared_ptr is used to get MPSEventPool destroyed after dependent instances +std::shared_ptr getMPSEventPool(); + +} // namespace at::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..01a6d8bc876ab27357ea9c7007d4add08b8d87be --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h @@ -0,0 +1,66 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace mps::detail { + +constexpr uint32_t PHILOX_STATE_N = 7; +struct rng_data_pod { + std::array state{1}; + uint64_t seed = default_rng_seed_val; +}; + +TORCH_API const Generator& getDefaultMPSGenerator(); +TORCH_API Generator +createMPSGenerator(uint64_t seed_val = default_rng_seed_val); + +} // namespace mps::detail + +struct TORCH_API MPSGeneratorImpl : public c10::GeneratorImpl { + // Constructors + MPSGeneratorImpl(uint64_t seed_in = default_rng_seed_val); + ~MPSGeneratorImpl() override = default; + + // MPSGeneratorImpl methods + std::shared_ptr clone() const; + void set_current_seed(uint64_t seed) override; + void set_offset(uint64_t offset) override; + uint64_t get_offset() const override; + uint64_t current_seed() const override; + uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; + void update_philox_counters(); + + void set_engine(at::Philox4_32 engine) { + engine_ = engine; + } + at::Philox4_32 engine() { + return engine_; + } + uint32_t* state_data() { + return data_.state.data(); + } + static DeviceType device_type() { + return DeviceType::MPS; + } + + private: + mps::detail::rng_data_pod data_; + at::Philox4_32 engine_; + + MPSGeneratorImpl* clone_impl() const override; +}; + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSGuardImpl.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSGuardImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..86b357aeac416c8d001142ed8bd8afb7c7aff8d9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSGuardImpl.h @@ -0,0 +1,187 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::mps { + +typedef MPSEvent* mpsEvent_t; + +// TODO: Move the MPSGuardImpl to inherit from NoOpDeviceGuardImpl +// https://github.com/pytorch/pytorch/issues/77170 +struct TORCH_API MPSGuardImpl final + : public c10::impl::DeviceGuardImplInterface { + static constexpr c10::DeviceType static_type = c10::DeviceType::MPS; + + // constructor + MPSGuardImpl() {} + explicit MPSGuardImpl(c10::DeviceType t) { + TORCH_CHECK( + t == DeviceType::MPS, + "MPSGuardImpl initialized with non-MPS DeviceType: ", + t); + } + + // returns the type + c10::DeviceType type() const override { + return c10::DeviceType::MPS; + } + + Device exchangeDevice(Device d) const override { + return Device(c10::DeviceType::MPS, 0); + } + + Device getDevice() const override { + return Device(c10::DeviceType::MPS, 0); + } + + std::optional uncheckedGetDevice() const noexcept { + return Device(c10::DeviceType::MPS, 0); + } + + void setDevice(Device d) const override { + TORCH_CHECK(d.is_mps(), "Expected a MPS device, but got ", d); + } + + void uncheckedSetDevice(Device d) const noexcept override { + // TODO: Currently setting only device 0 + } + + Stream getStream(Device d) const override { + return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); + } + + Stream getNewStream(Device, int priority = 0) const override { + (void)priority; + return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); + } + + Stream getDefaultStream(Device d) const override { + return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); + } + + // NB: These do NOT set the current device + Stream exchangeStream(Stream s) const override { + return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); + } + DeviceIndex deviceCount() const noexcept override { + if (at::hasMPS()) { + // TODO: extend it for multi-device case + return 1; + } else { + return 0; + } + } + + // Event-related functions + void createEvent(mpsEvent_t* event, const EventFlag flag) const; + + void destroyEvent(void* event, const DeviceIndex device_index) + const noexcept override; + + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override; + + void block(void* event, const Stream& stream) const override; + + bool queryEvent(void* event) const override; + + void synchronizeEvent(void* event) const override; + + double elapsedTime(void* event1, void* event2, const DeviceIndex device_index) + const override; + + void synchronizeDevice(const DeviceIndex device_index) const override; +}; + +/// A variant of OptionalDeviceGuard that is specialized for MPS. +struct OptionalMPSGuard { + explicit OptionalMPSGuard() : guard_() {} + + explicit OptionalMPSGuard(std::optional device_opt) + : guard_(device_opt) {} + + /// Set the current MPS device to the passed device index, if it is not + /// nullopt + explicit OptionalMPSGuard(std::optional device_index_opt) + : guard_(device_index_opt) {} + + // Copy is not allowed + OptionalMPSGuard(const OptionalMPSGuard&) = delete; + OptionalMPSGuard& operator=(const OptionalMPSGuard&) = delete; + OptionalMPSGuard(OptionalMPSGuard&& other) = delete; + OptionalMPSGuard& operator=(OptionalMPSGuard&& other) = delete; + + /// Sets the MPS device to the given device, initializing the guard if it + /// is not already initialized. Errors if the given device is not a MPS + /// device. + void set_device(Device device) { + guard_.set_device(device); + } + + /// Sets the MPS device to the given device, initializing the guard if it is + /// not already initialized. Errors if the given device is not a MPS device. + void reset_device(Device device) { + guard_.reset_device(device); + } + + /// Sets the MPS device to the given device index, initializing the guard if + /// it is not already initialized. + void set_index(DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set immediately prior to initialization of the + /// guard, or nullopt if the guard is uninitialized. + std::optional original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device, if the guard is initialized, + /// or nullopt if the guard is uninitialized. + std::optional current_device() const { + return guard_.current_device(); + } + + /// Restore the original MPS device, resetting this guard to uninitialized + /// state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalDeviceGuard guard_; +}; + +C10_REGISTER_GUARD_IMPL(MPS, MPSGuardImpl) + +} // namespace at::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSHooks.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSHooks.h new file mode 100644 index 0000000000000000000000000000000000000000..5f743b3e1dff0644d09405dfde7bae7d9fd82f7d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSHooks.h @@ -0,0 +1,76 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include +#include +#include + +namespace at::mps { + +// The real implementation of MPSHooksInterface +struct MPSHooks : public at::MPSHooksInterface { + MPSHooks(at::MPSHooksArgs) {} + void init() const override; + + // MPSDevice interface + bool hasMPS() const override; + bool isOnMacOSorNewer(unsigned major, unsigned minor) const override; + + Device getDeviceFromPtr(void* data) const override; + + // MPSGeneratorImpl interface + const Generator& getDefaultGenerator( + DeviceIndex device_index = -1) const override; + Generator getNewGenerator(DeviceIndex device_index = -1) const override; + + // MPSStream interface + void deviceSynchronize() const override; + void commitStream() const override; + void* getCommandBuffer() const override; + void* getDispatchQueue() const override; + + // MPSAllocator interface + Allocator* getMPSDeviceAllocator() const override; + void emptyCache() const override; + size_t getCurrentAllocatedMemory() const override; + size_t getDriverAllocatedMemory() const override; + size_t getRecommendedMaxMemory() const override; + void setMemoryFraction(double ratio) const override; + bool isPinnedPtr(const void* data) const override; + Allocator* getPinnedMemoryAllocator() const override; + + // MPSProfiler interface + void profilerStartTrace(const std::string& mode, bool waitUntilCompleted) + const override; + void profilerStopTrace() const override; + + // MPSEvent interface + uint32_t acquireEvent(bool enable_timing) const override; + void releaseEvent(uint32_t event_id) const override; + void recordEvent(uint32_t event_id) const override; + void waitForEvent(uint32_t event_id) const override; + void synchronizeEvent(uint32_t event_id) const override; + bool queryEvent(uint32_t event_id) const override; + double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) + const override; + + bool isBuilt() const override { + return true; + } + bool isAvailable() const override { + return hasMPS(); + } + bool hasPrimaryContext(DeviceIndex device_index) const override { + // When MPS is available, it is always in use for the one device. + return true; + } +}; + +} // namespace at::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSProfiler.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSProfiler.h new file mode 100644 index 0000000000000000000000000000000000000000..0c97168d8ce36a2062f40c228694843604ba35a4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSProfiler.h @@ -0,0 +1,472 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifndef __OBJC__ +typedef void* MTLCaptureManager; +#endif + +namespace at::mps { + +namespace Profiler { + +struct BaseInfo { + // profiling info types + enum class Type { + GRAPH, + KERNEL, + COPY, + CPU_FALLBACK, + }; + + BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle) + : type(infoType), profileId(Id), handle(Handle) {} + virtual ~BaseInfo() = default; + + // type of profiling info + Type type; + // unique profile ID for execution instances of operations or copies + uint64_t profileId; + // ID generated by os_signpost + // since it's possible to use event and interval-based signposts at the + // same time, we need separate IDs for each. + os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0; + // accumulated GPU time in ms (obtained from CompletionHandler's "GPUEndTime - + // GPUStartTime") + std::atomic totalGpuTime{0.0}; + // accumulated Scheduling time in ms (obtained from CompletionHandler's + // "KernelEndTime - KernelStartTime") + std::atomic totalSchedulingTime{0.0}; + // indicates if the operation or copy execution has completed + std::atomic_bool completed{false}; + // handle used to identify the profile info's instance (usually the pointer) + const uintptr_t handle; + + virtual const std::string toString( + double gpuTime = 0, + double schedulingTime = 0) const; + // builds a string for a tensor (format: Device:ScalarType[tensor.sizes()]) + static std::string buildTensorString( + const Tensor& tensor, + bool includeBufferId = false); + static uint64_t getTime() { + return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); + } +}; + +struct OperationInfo : BaseInfo { + OperationInfo( + const void* Handle, + bool IsGraph, + uint64_t Id, + const std::string& StrKey) + : BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)), + strKey(StrKey) {} + + uint64_t runCount = 0; + std::string strKey; + + const std::string toString(double gpuTime = 0, double schedulingTime = 0) + const override; + + // builds a string for a kernel + static std::string buildKernelString( + const std::string& kernelName, + const TensorList& tensors, + bool includeBufferId = false) { + std::stringstream kernelStr; + kernelStr << kernelName; + for (const Tensor& tensor : tensors) { + kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId); + } + return kernelStr.str(); + } +}; + +struct CpuFbInfo : BaseInfo { + CpuFbInfo(uint64_t Id, const std::string& OpName) + : BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) {} + + uint64_t runCount = 0; + // the current and total overhead of copies in bytes required to convert the + // Op's input tensors from MPS to CPU and then output from CPU back to MPS + size_t currentCopyOverhead = 0; + size_t totalCopyOverhead = 0; + std::string opName; + std::string strKey; + uint64_t startTime = 0; + + const std::string toString(double gpuTime = 0, double schedulingTime = 0) + const override; + + void updateCopyOverhead(const TensorList& tensors) { + currentCopyOverhead = 0; + for (const Tensor& tensor : tensors) { + if (tensor.defined()) { + currentCopyOverhead += tensor.nbytes(); + } + } + totalCopyOverhead += currentCopyOverhead; + } +}; + +struct CopyInfo : BaseInfo { + enum class Kind { + MPS_TO_MPS, + MPS_TO_CPU, + CPU_TO_MPS, + }; + + CopyInfo( + const void* Handle, + size_t Length, + uint64_t Id, + bool IsNonBlocking, + bool UsesBlitter) + : BaseInfo(Type::COPY, Id, uintptr_t(Handle)), + kind(Kind::MPS_TO_MPS), + length(Length), + isNonBlocking(IsNonBlocking), + usesBlitter(UsesBlitter) {} + + Kind kind; + size_t length; + bool isNonBlocking; + bool usesBlitter; + std::string srcStrKey; + std::string dstStrKey; + // for copies that don't use blitters, we measure CPU time + uint64_t startTime = 0; + + const std::string toString(double gpuTime = 0, double schedulingTime = 0) + const override; + + static std::string buildTensorString( + const void* buffer, + const OptionalTensorRef tensor, + bool includeBufferId = false); + + static bool isStorageOnMPS( + const void* buffer, + const OptionalTensorRef tensor) { + if (tensor.has_value()) { + return tensor->device().type() == at::kMPS; + } + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer); + // getUnalignedBufferSize() returns -1 if input buffer is not on MPS device + return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0; + } + + static Kind getCopyKind( + const void* srcBuffer, + const void* dstBuffer, + const OptionalTensorRef srcTensor, + const OptionalTensorRef dstTensor) { + const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor); + const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS); + if (isSrcOnMPS && !isDstOnMPS) { + return Kind::MPS_TO_CPU; + } else if (!isSrcOnMPS && isDstOnMPS) { + return Kind::CPU_TO_MPS; + } + return Kind::MPS_TO_MPS; + } +}; + +struct CopyStat : CopyInfo { + explicit CopyStat(std::string CopyKindStr) + : CopyInfo(nullptr, 0, 0, false, false), + kindStr(std::move(CopyKindStr)) {} + // total number of copies + size_t totalCount = 0; + // number of Scalar copies (i.e., less than sizeof(int64)) + size_t scalarsCount = 0; + // number of blocking copies (i.e., require syncing to GPU) + size_t blockingCount = 0; + // number of copies that used memcpy(), instead of Metal Blit Encoder + size_t memcpyCount = 0; + // accumulated GPU time in ms for the scalar copies + std::atomic scalarsGpuTime{0.0}; + // copy kind in string type + std::string kindStr; +}; + +class MPSProfiler { + public: + // lower 16 bits used for profiler options + enum ProfileOptions : uint32_t { + OPTIONS_NONE = 0, + // ALL_* means, all signpost types (RUN_OPERATION|BLIT_COPY|CPU_FALLBACK, + // etc.) (used for convenience to not compute bit flags by OR-ing manually) + // trace all signpost types using events + ALL_SIGNPOST_EVENTS = (1 << 0), + // trace all signpost types using intervals + ALL_SIGNPOST_INTERVALS = (1 << 1), + // always wait for command buffer to finish executing after each commit + WAIT_UNTIL_COMPLETED = (1 << 2), + // for interval-based signposts, include the scheduling portion of + // Graph/Kernel/Copy executions as well. + // if flag is disable, only "GPU run time" is included in interval, + // and not schedule time. + INCLUDE_SCHEDULE_INTERVAL = (1 << 3), + + // use these if you need to trace signposts types individually (rarely + // required) trace signpost using intervals + USE_INTERVALS = (1 << 4), + // trace signpost by emitting events + USE_EVENTS = (1 << 5), + // used for sanity check (Change this when new option added) + OPTIONS_COUNT = (USE_EVENTS << 1) - 1, + }; + + // when adding new types, #define the type string in MPSProfiler.mm as well. + // upper 16 bits used for event types + enum SignpostTypes : uint32_t { + SIGNPOST_NONE = 0, + // trace signposts for PyTorch operation executions + RUN_OPERATION = (1 << 16), + // trace signposts for blitter copies + BLIT_COPY = (1 << 17), + // trace signposts for ops that fall back on CPU + CPU_FALLBACK = (1 << 18), + // used for sanity check (Change this when new type added) + SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1, + }; + + enum LogOptions : uint32_t { + LOG_NONE = 0, + + // Info logging options during execution + // ------------------------------------- + // prints operation info (id/key/run_count) during execution + OPERATION_INFO = (1 << 0), + // prints copy info (src/dst tensors/buffers, size, etc.) during execution + COPY_INFO = (1 << 1), + // prints CPU Fallback info (id/runCount/opName/copyOverhead) during + // execution + CPU_FALLBACK_INFO = (1 << 2), + + // Profiling Statistics logging options when process terminates + // ------------------------------------------------------------ + // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before + // process terminates this is convenient to not combine following stats bit + // flags manually + ALL_STATS = (1 << 3), + // prints operation stats (GPU times, run count, etc.) before process + // terminates + OPERATION_STATS = (1 << 4), + // prints copies stats (GPU times, copy kinds, sizes, etc.) before process + // terminates + COPY_STATS = (1 << 5), + // prints CPU Fallback stats (CPU times, run times, size of MPS<->CPU copies + // for tensors, etc.) before process terminates + CPU_FALLBACK_STATS = (1 << 6), + + // Metadata format options when logging the info + // --------------------------------------------- + // if enabled, includes GPU run time in metadata (i.e., + // GPUEndTime-GPUStartTime from Metal Command Buffers) (e.g., [GPU=0.324 + // ms]) + INCLUDE_GPU_TIME = (1 << 7), + // if enabled, includes GPU scheduling time in metadata separately + // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers) + // e.g., [GPU=0.324 ms, KRNL=0.036 ms] + INCLUDE_KERNEL_TIME = (1 << 8), + // if enabled, includes the unique buffer ID in metadata for the storage + // of a tensor that was allocated on MPSAllocator. This is useful (along + // with the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers that are + // involved with various operations. + INCLUDE_BUFFER_ID = (1 << 9), + + // used for sanity check (Change this when new option added) + LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1, + }; + + explicit MPSProfiler(); + ~MPSProfiler(); + + // the handle is either "MPSGraph*" or "id" for Metal + // Kernels the beginProfile*() functions return a profileId which is unique + // per graph/kernel/copy + uint64_t beginProfileKernel( + const void* handle, + const std::string& strKey, + bool isGraph); + uint64_t beginProfileKernel( + const void* handle, + const std::string& kernelName, + const TensorList& tensors); + uint64_t beginProfileCopy( + const void* srcBuffer, + const void* dstBuffer, + const OptionalTensorRef srcTensor, + const OptionalTensorRef dstTensor, + size_t length, + bool isNonBlocking, + bool usesBlitter = true); + uint64_t beginProfileCPUFallback( + const std::string& opName, + const TensorList& tensors); + void beginProfileGPUInterval(const void* handle); + + void endProfileCopy(uint64_t profileId, SyncType syncType); + void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE); + void endProfileCPUFallback(const std::string& opName); + + // these are used to hook into Python bindings for torch.mps.profiler module. + // this enables generating OS Signpost traces from MPSProfiler on-demand + // during runtime (instead of environment variables). + // The "mode" could be either "interval", "event", or both "interval,event" + // for interval-based and/or event-based signpost tracing. + void StartTrace(const std::string& mode, bool waitUntilCompleted); + void StopTrace(); + + // Abstractions for GPU trace capturing + bool isCaptureEnabled() const; + bool isCapturing() const; + void startCapture(const std::string& name, MPSStream* stream = nullptr); + void stopCapture(MPSStream* stream = nullptr); + + // convenience functions to indicate whether signpost tracing or + // logging are enabled for the SignpostTypes + bool isOperationProfilingEnabled() const { + return (m_signpost_types & SignpostTypes::RUN_OPERATION) || + (m_log_options & + (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS)); + } + bool isCopyProfilingEnabled() const { + return (m_signpost_types & SignpostTypes::BLIT_COPY) || + (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS)); + } + bool isCPUFallbackProfilingEnabled() const { + return (m_signpost_types & SignpostTypes::CPU_FALLBACK) || + (m_log_options & + (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS)); + } + bool isSignpostTracingEnabled() const { + return (m_signpost_types != SignpostTypes::SIGNPOST_NONE); + } + + private: + // indicates what type of signpost types are enabled and traced by MPS + // profiler. + uint32_t m_signpost_types = 0; + uint32_t m_profile_options = 0; + uint32_t m_log_options = 0; + uint64_t m_kernel_counter = 0; + uint64_t m_graph_counter = 0; + uint64_t m_cpu_fb_counter = 0; + uint64_t m_copy_counter = 0; + // technically, it's possible to trace both events and intervals at the same + // time so we use separate os_log categories for them + os_log_t m_os_log_events; + os_log_t m_os_log_intervals; + // stats logging could run either from destructor or signal handler + // so this is used to check if logging has already started. + std::atomic_bool hasLoggedStats{false}; + // indicates there are pending completionHandler callbacks that haven't been + // called yet. + std::atomic_bool hasPendingCompletionHandlers{false}; + // used to capture sigint signal to log profiling stats + static struct sigaction currentSigint, previousSigint; + + // We use the following lists for two reasons: + // 1- for interval-based signposts the "begin" point won't be in same function + // as the "end" point where we need to be able to retrieve signpost's info + // 2- if Operations info need to be logged when process ends using + // LogOptions::OPERATION_INFO. + + // the pointer key for this map is either "MPSGraph*" or + // "id" for Metal Kernels this list is retained and + // could be logged along with aggregate profiling numbers when the process + // ends. + std::unordered_map> + m_op_info_list{}; + // the string key for this map is the op name that we fall back to execute on + // CPU this list is retained and could be logged along with aggregate + // profiling numbers when the process ends. + std::unordered_map> + m_cpu_fb_info_list{}; + // this list contains the info for copies, and its key is the unique profileId + // which is generated from m_copy_counter + // The copyInfo list is not retained. + std::unordered_map> m_copy_info_list{}; + // a short list that contains copy stats + std::unordered_map> + m_copy_stat_list{}; + + mutable MTLCaptureManager* captureManager = nil; + unsigned captureCount = 0; + + void initialize(); + void beginProfileExecution(BaseInfo& info, bool cpuExecution = false); + void endProfileExecution( + BaseInfo& info, + os_signpost_id_t event_signpost_id, + os_signpost_id_t interval_signpost_id, + double gpuTime, + double schedulingTime); + void addProfilerScheduledHandler(BaseInfo& info); + void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType); + void emitSignpostEvent( + SignpostTypes signpost_type, + os_signpost_id_t signpost_id, + const std::string& msg) const; + void beginSignpostInterval( + SignpostTypes signpost_type, + os_signpost_id_t signpost_id, + const std::string& msg) const; + void endSignpostInterval( + SignpostTypes signpost_type, + os_signpost_id_t signpost_id) const; + + void updateCopyStats( + const CopyInfo& copyInfo, + double gpuTime, + double schedulingTime); + // returns true if logging the profiling info "during the execution" is + // enabled + bool isProfileInfoLoggingEnabled( + BaseInfo::Type infoType, + bool isExecutionEnded); + // logs all the profiling stats that are enabled + void logProfilingStats(); + // logs kernel profiling stats when the process ends. + void logOperationsProfilingStats(std::FILE* f) const; + // logs CPU Fallback profiling stats when the process ends. + void logCPUFallbackProfilingStats(std::FILE* f) const; + // logs copy profiling stats when the process ends. + void logCopyProfilingStats(std::FILE* f) const; + + os_signpost_id_t generateSignpostId( + os_signpost_type_t signpostType, + const void* ptr = nullptr); + static SignpostTypes getSignpostType(BaseInfo::Type infoType); + static void handleIntSignal(int signal); +}; + +} // namespace Profiler + +Profiler::MPSProfiler& getMPSProfiler(); + +} // namespace at::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSStream.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSStream.h new file mode 100644 index 0000000000000000000000000000000000000000..e721649f42a607942a06630b7b9b9b0970049174 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/mps/MPSStream.h @@ -0,0 +1,171 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#include +typedef MPSCommandBuffer* MPSCommandBuffer_t; +typedef id MTLCommandQueue_t; +typedef id MTLComputeCommandEncoder_t; +typedef id MTLSharedEvent_t; +typedef id MTLDevice_t; +typedef id MTLBuffer_t; +#else +#include +typedef void* MPSCommandBuffer_t; +typedef void* MPSGraph; +typedef void* MPSGraphExecutionDescriptor; +typedef void* MPSGraphCompilationDescriptor; +typedef void* MTLCommandQueue_t; +typedef void* MTLComputeCommandEncoder_t; +typedef void* MTLSharedEvent_t; +typedef void* MTLDevice_t; +typedef void* MTLBuffer_t; +typedef void* MTLCommandBufferHandler; +typedef void* NSDictionary; +#define nil NULL +#endif + +namespace at::mps { + +//----------------------------------------------------------------- +// MPSStream +//----------------------------------------------------------------- + +enum class SyncType { + NONE, // no commit to command buffer + COMMIT, // commit and flush the command buffer + COMMIT_AND_WAIT, // flush and wait for command buffer execution to finish + COMMIT_AND_CONTINUE, // commit and continue with a new underlying command buffer + COMMIT_ADAPTIVE, // commit adaptively based on available memory +}; + +class TORCH_API MPSStream { + public: + enum Unchecked { UNCHECKED }; + + /// Construct a MPSStream from a Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a MPS stream. + explicit MPSStream(Stream stream); + + ~MPSStream(); + + MTLCommandQueue_t commandQueue() const { + return _commandQueue; + } + + dispatch_queue_t queue() const { + return _serialQueue; + } + + MPSCommandBuffer_t commandBuffer(); + MTLComputeCommandEncoder_t commandEncoder(); + void endKernelCoalescing(); + void synchronize(SyncType syncType); + void fill(MTLBuffer_t buffer, uint8_t value, size_t length, size_t offset, SyncType syncType = SyncType::NONE); + void copy(MTLBuffer_t srcBuffer, + MTLBuffer_t dstBuffer, + size_t length, + size_t srcOffset, + size_t dstOffset, + uint64_t profileId, + SyncType syncType = SyncType::NONE); + void copy_and_sync(MTLBuffer_t srcBuffer, + MTLBuffer_t dstBuffer, + size_t length, + size_t srcOffset, + size_t dstOffset, + bool non_blocking, + uint64_t profileId); + void executeMPSGraph(MPSGraph* mpsGraph, + NSDictionary* feeds, + NSDictionary* results, + SyncType syncType = SyncType::NONE); + void addCompletedHandler(MTLCommandBufferHandler block); + + /// Get the MPS device index that this stream is associated with. + c10::DeviceIndex device_index() const { + return _stream.device_index(); + } + + MTLCommandQueue_t stream() const { + return _commandQueue; + } + + MTLDevice_t device() const; + + /// Explicit conversion to Stream. + Stream unwrap() const { + return _stream; + } + + MTLBuffer_t getErrorBuffer(); + void checkLastError(); + + private: + Stream _stream; + MTLCommandQueue_t _commandQueue = nil; + MPSCommandBuffer_t _commandBuffer = nil; + MPSCommandBuffer_t _prevCommandBuffer = nil; + MTLComputeCommandEncoder_t _commandEncoder = nil; + MPSGraphExecutionDescriptor* _executionDescriptor = nil; + MPSGraphCompilationDescriptor* _compilationDescriptor = nil; + dispatch_queue_t _serialQueue = nullptr; + // CommitAndContinue is enabled by default + bool _enableCommitAndContinue = true; + // Buffer that contains last raised error + MTLBuffer_t _errorBuffer = nil; + + // use synchronize() to access any of these commit functions outside MPSStream + void commit(); + void commitAndWait(); + void commitAndContinue(); + void flush(); +}; + +/** + * Get the current MPS stream + */ +TORCH_API MPSStream* getCurrentMPSStream(); + +/** + * Get the default MPS stream + */ +TORCH_API MPSStream* getDefaultMPSStream(); + +//----------------------------------------------------------------- +// MPSStreamImpl +//----------------------------------------------------------------- + +class TORCH_API MPSStreamImpl { + public: + /** + * Gets single instance of the MPSStream. + */ + static MPSStream* getInstance(); + + private: + static MPSStream* _stream; + MPSStreamImpl(); +}; + +#ifdef __OBJC__ +void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()); +#endif +} // namespace at::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Activation.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Activation.h new file mode 100644 index 0000000000000000000000000000000000000000..3d91338d6cdd4c3387e4e97eb8a724d60cca3834 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Activation.h @@ -0,0 +1,78 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { +class Scalar; +} + +namespace at { +struct TensorIterator; +struct TensorIteratorBase; +class TensorBase; +} + +namespace at::native { + +using structured_activation_fn = void (*)(TensorIteratorBase&); +using structured_activation_backward_fn = void (*)(TensorIteratorBase&); + +using activation_fn = void (*)(TensorIterator&); +using activation_backward_fn = void (*)(TensorIterator&); +using softplus_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&); +using softplus_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&); +using threshold_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&); +using hardtanh_backward_fn = void (*)(TensorIterator&, const c10::Scalar&, const c10::Scalar&); +using hardsigmoid_fn = void(*)(TensorIteratorBase&); +using hardsigmoid_backward_fn = void(*)(TensorIteratorBase&); +using hardswish_fn = void(*)(TensorIterator&); +using hardswish_backward_fn = void(*)(TensorIterator&); +using shrink_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); +using softshrink_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); +using shrink_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); +using elu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&, const c10::Scalar&); +using elu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&, const c10::Scalar&, bool); +using leaky_relu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); +using leaky_relu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); +using log_sigmoid_cpu_fn = void (*)(TensorBase&, TensorBase&, const TensorBase&); +using gelu_fn = void (*)(TensorIteratorBase&, GeluType); +using gelu_backward_fn = void (*)(TensorIteratorBase&, GeluType); +using glu_jvp_fn = void (*)(TensorIteratorBase&); + +DECLARE_DISPATCH(elu_fn, elu_stub) +DECLARE_DISPATCH(elu_backward_fn, elu_backward_stub) +DECLARE_DISPATCH(softplus_fn, softplus_stub) +DECLARE_DISPATCH(softplus_backward_fn, softplus_backward_stub) +DECLARE_DISPATCH(log_sigmoid_cpu_fn, log_sigmoid_cpu_stub) +DECLARE_DISPATCH(activation_backward_fn, log_sigmoid_backward_stub) +DECLARE_DISPATCH(threshold_fn, threshold_stub) +DECLARE_DISPATCH(gelu_fn, GeluKernel) +DECLARE_DISPATCH(gelu_backward_fn, GeluBackwardKernel) +DECLARE_DISPATCH(hardtanh_backward_fn, hardtanh_backward_stub) +DECLARE_DISPATCH(hardsigmoid_fn, hardsigmoid_stub) +DECLARE_DISPATCH(hardsigmoid_backward_fn, hardsigmoid_backward_stub) +DECLARE_DISPATCH(hardswish_fn, hardswish_stub) +DECLARE_DISPATCH(hardswish_backward_fn, hardswish_backward_stub) +DECLARE_DISPATCH(shrink_fn, hardshrink_stub) +DECLARE_DISPATCH(softshrink_fn, softshrink_stub) +DECLARE_DISPATCH(shrink_backward_fn, shrink_backward_stub) +DECLARE_DISPATCH(leaky_relu_fn, leaky_relu_stub) +DECLARE_DISPATCH(leaky_relu_backward_fn, leaky_relu_backward_stub) +DECLARE_DISPATCH(structured_activation_fn, glu_stub) +DECLARE_DISPATCH(activation_backward_fn, glu_backward_stub) +DECLARE_DISPATCH(glu_jvp_fn, glu_jvp_stub) +DECLARE_DISPATCH(structured_activation_fn, silu_stub) +DECLARE_DISPATCH(structured_activation_backward_fn, silu_backward_stub) +DECLARE_DISPATCH(structured_activation_fn, mish_stub) +DECLARE_DISPATCH(activation_backward_fn, mish_backward_stub) +DECLARE_DISPATCH(activation_fn, prelu_stub) +DECLARE_DISPATCH(activation_backward_fn, prelu_backward_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/AdaptivePooling.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/AdaptivePooling.h new file mode 100644 index 0000000000000000000000000000000000000000..ce4328d20c33660d7356bd7d96772d46964cf5ca --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/AdaptivePooling.h @@ -0,0 +1,54 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace at::native { + +using adaptive_avg_pooling2d_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); +using adaptive_avg_pooling2d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); +DECLARE_DISPATCH(adaptive_avg_pooling2d_fn, adaptive_avg_pool2d_kernel) +DECLARE_DISPATCH(adaptive_avg_pooling2d_backward_fn, adaptive_avg_pool2d_backward_kernel) + +using adaptive_max_pooling2d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size); +using adaptive_max_pooling2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); +DECLARE_DISPATCH(adaptive_max_pooling2d_fn, adaptive_max_pool2d_kernel) +DECLARE_DISPATCH(adaptive_max_pooling2d_backward_fn, adaptive_max_pool2d_backward_kernel) + +using adaptive_avg_pooling3d_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); +using adaptive_avg_pooling3d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); +DECLARE_DISPATCH(adaptive_avg_pooling3d_fn, adaptive_avg_pool3d_kernel) +DECLARE_DISPATCH(adaptive_avg_pooling3d_backward_fn, adaptive_avg_pool3d_backward_kernel) + +using adaptive_max_pooling3d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size); +using adaptive_max_pooling3d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); +DECLARE_DISPATCH(adaptive_max_pooling3d_fn, adaptive_max_pool3d_kernel) +DECLARE_DISPATCH(adaptive_max_pooling3d_backward_fn, adaptive_max_pool3d_backward_kernel) + +inline int64_t start_index(int64_t a, int64_t b, int64_t c) { + return (a / b) * c + ((a % b) * c) / b; +} + +inline int64_t end_index(int64_t a, int64_t b, int64_t c) { + return 1 + ((a + 1) * c - 1) / b; +} + +inline void adaptive_pool_empty_output_check(const Tensor& gradOutput_, const char* arg_name) { + int64_t ndim = gradOutput_.ndimension(); + for (const auto i : c10::irange(1, ndim)) { + TORCH_CHECK(gradOutput_.size(i) > 0, + arg_name, "(): Expected grad_output to have non-zero size for non-batch dimensions, " + "but grad_output has sizes ", gradOutput_.sizes(), " with dimension ", i, + " being empty"); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/AmpKernels.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/AmpKernels.h new file mode 100644 index 0000000000000000000000000000000000000000..78a936a046d528c5128ff3293826a804fd0ec5c2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/AmpKernels.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using _amp_foreach_non_finite_check_and_unscale_cpu__fn = void (*)( + TensorList, + Tensor&, + const Tensor&); + +using _amp_update_scale_cpu__fn = Tensor& (*)( + Tensor&, + Tensor&, + const Tensor&, + double, + double, + int64_t); + +DECLARE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu__fn, _amp_foreach_non_finite_check_and_unscale_cpu_stub) +DECLARE_DISPATCH(_amp_update_scale_cpu__fn, _amp_update_scale_cpu_stub) + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h new file mode 100644 index 0000000000000000000000000000000000000000..e5708a286f81bb23689ef96ad2181156951c915f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h @@ -0,0 +1,337 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +// Forward declare TI +namespace at { +class Tensor; +struct TensorIterator; + +namespace native { +enum class TransposeType; +} + +} + +namespace at::native { + +enum class LapackLstsqDriverType : int64_t { Gels, Gelsd, Gelsy, Gelss}; + +#if AT_BUILD_WITH_LAPACK() +// Define per-batch functions to be used in the implementation of batched +// linear algebra operations + +template +void lapackCholesky(char uplo, int n, scalar_t *a, int lda, int *info); + +template +void lapackCholeskyInverse(char uplo, int n, scalar_t *a, int lda, int *info); + +template +void lapackEig(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *w, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, value_t *rwork, int *info); + +template +void lapackGeqrf(int m, int n, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info); + +template +void lapackOrgqr(int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info); + +template +void lapackOrmqr(char side, char trans, int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *c, int ldc, scalar_t *work, int lwork, int *info); + +template +void lapackSyevd(char jobz, char uplo, int n, scalar_t* a, int lda, value_t* w, scalar_t* work, int lwork, value_t* rwork, int lrwork, int* iwork, int liwork, int* info); + +template +void lapackGels(char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info); + +template +void lapackGelsd(int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + value_t *s, value_t rcond, int *rank, + scalar_t* work, int lwork, + value_t *rwork, int* iwork, int *info); + +template +void lapackGelsy(int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + int *jpvt, value_t rcond, int *rank, + scalar_t *work, int lwork, value_t* rwork, int *info); + +template +void lapackGelss(int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + value_t *s, value_t rcond, int *rank, + scalar_t *work, int lwork, + value_t *rwork, int *info); + +template +struct lapackLstsq_impl; + +template +struct lapackLstsq_impl { + static void call( + char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info, // Gels flavor + int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor + value_t *s, // Gelss flavor + int *iwork // Gelsd flavor + ) { + lapackGels( + trans, m, n, nrhs, + a, lda, b, ldb, + work, lwork, info); + } +}; + +template +struct lapackLstsq_impl { + static void call( + char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info, // Gels flavor + int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor + value_t *s, // Gelss flavor + int *iwork // Gelsd flavor + ) { + lapackGelsy( + m, n, nrhs, + a, lda, b, ldb, + jpvt, rcond, rank, + work, lwork, rwork, info); + } +}; + +template +struct lapackLstsq_impl { + static void call( + char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info, // Gels flavor + int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor + value_t *s, // Gelss flavor + int *iwork // Gelsd flavor + ) { + lapackGelsd( + m, n, nrhs, + a, lda, b, ldb, + s, rcond, rank, + work, lwork, + rwork, iwork, info); + } +}; + +template +struct lapackLstsq_impl { + static void call( + char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info, // Gels flavor + int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor + value_t *s, // Gelss flavor + int *iwork // Gelsd flavor + ) { + lapackGelss( + m, n, nrhs, + a, lda, b, ldb, + s, rcond, rank, + work, lwork, + rwork, info); + } +}; + +template +void lapackLstsq( + char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info, // Gels flavor + int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor + value_t *s, // Gelss flavor + int *iwork // Gelsd flavor + ) { + lapackLstsq_impl::call( + trans, m, n, nrhs, + a, lda, b, ldb, + work, lwork, info, + jpvt, rcond, rank, rwork, + s, + iwork); +} + +template +void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info); + +template +void lapackLu(int m, int n, scalar_t *a, int lda, int *ipiv, int *info); + +template +void lapackLdlHermitian( + char uplo, + int n, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* work, + int lwork, + int* info); + +template +void lapackLdlSymmetric( + char uplo, + int n, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* work, + int lwork, + int* info); + +template +void lapackLdlSolveHermitian( + char uplo, + int n, + int nrhs, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* b, + int ldb, + int* info); + +template +void lapackLdlSolveSymmetric( + char uplo, + int n, + int nrhs, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* b, + int ldb, + int* info); + +template +void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info); +#endif + +#if AT_BUILD_WITH_BLAS() +template +void blasTriangularSolve(char side, char uplo, char trans, char diag, int n, int nrhs, scalar_t* a, int lda, scalar_t* b, int ldb); +#endif + +using cholesky_fn = void (*)(const Tensor& /*input*/, const Tensor& /*info*/, bool /*upper*/); +DECLARE_DISPATCH(cholesky_fn, cholesky_stub) + +using cholesky_inverse_fn = Tensor& (*)(Tensor& /*result*/, Tensor& /*infos*/, bool /*upper*/); + +DECLARE_DISPATCH(cholesky_inverse_fn, cholesky_inverse_stub) + +using linalg_eig_fn = void (*)(Tensor& /*eigenvalues*/, Tensor& /*eigenvectors*/, Tensor& /*infos*/, const Tensor& /*input*/, bool /*compute_eigenvectors*/); + +DECLARE_DISPATCH(linalg_eig_fn, linalg_eig_stub) + +// Converts LAPACK's real-valued eigenvector encoding to complex eigenvectors +TORCH_API void linalg_eig_make_complex_eigenvectors( + const Tensor& complex_vectors, + const Tensor& complex_values, + const Tensor& real_vectors); + +DECLARE_DISPATCH( + void(*)(const Tensor&, const Tensor&, const Tensor&), + linalg_eig_make_complex_eigenvectors_stub) + + +using geqrf_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/); +DECLARE_DISPATCH(geqrf_fn, geqrf_stub) + +using orgqr_fn = Tensor& (*)(Tensor& /*result*/, const Tensor& /*tau*/); +DECLARE_DISPATCH(orgqr_fn, orgqr_stub) + +using ormqr_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/, const Tensor& /*other*/, bool /*left*/, bool /*transpose*/); +DECLARE_DISPATCH(ormqr_fn, ormqr_stub) + +using linalg_eigh_fn = void (*)( + const Tensor& /*eigenvalues*/, + const Tensor& /*eigenvectors*/, + const Tensor& /*infos*/, + bool /*upper*/, + bool /*compute_eigenvectors*/); +DECLARE_DISPATCH(linalg_eigh_fn, linalg_eigh_stub) + +using lstsq_fn = void (*)( + const Tensor& /*a*/, + Tensor& /*b*/, + Tensor& /*rank*/, + Tensor& /*singular_values*/, + Tensor& /*infos*/, + double /*rcond*/, + std::string /*driver_name*/); +DECLARE_DISPATCH(lstsq_fn, lstsq_stub) + +using triangular_solve_fn = void (*)( + const Tensor& /*A*/, + const Tensor& /*B*/, + bool /*left*/, + bool /*upper*/, + TransposeType /*transpose*/, + bool /*unitriangular*/); +DECLARE_DISPATCH(triangular_solve_fn, triangular_solve_stub) + +using lu_factor_fn = void (*)( + const Tensor& /*input*/, + const Tensor& /*pivots*/, + const Tensor& /*infos*/, + bool /*compute_pivots*/); +DECLARE_DISPATCH(lu_factor_fn, lu_factor_stub) + +using unpack_pivots_fn = void(*)( + TensorIterator& iter, + const int64_t dim_size, + const int64_t max_pivot); +DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub) + +using lu_solve_fn = void (*)( + const Tensor& /*LU*/, + const Tensor& /*pivots*/, + const Tensor& /*B*/, + TransposeType /*trans*/); +DECLARE_DISPATCH(lu_solve_fn, lu_solve_stub) + +using ldl_factor_fn = void (*)( + const Tensor& /*LD*/, + const Tensor& /*pivots*/, + const Tensor& /*info*/, + bool /*upper*/, + bool /*hermitian*/); +DECLARE_DISPATCH(ldl_factor_fn, ldl_factor_stub) + +using svd_fn = void (*)( + const Tensor& /*A*/, + const bool /*full_matrices*/, + const bool /*compute_uv*/, + const std::optional& /*driver*/, + const Tensor& /*U*/, + const Tensor& /*S*/, + const Tensor& /*Vh*/, + const Tensor& /*info*/); +DECLARE_DISPATCH(svd_fn, svd_stub) + +using ldl_solve_fn = void (*)( + const Tensor& /*LD*/, + const Tensor& /*pivots*/, + const Tensor& /*result*/, + bool /*upper*/, + bool /*hermitian*/); +DECLARE_DISPATCH(ldl_solve_fn, ldl_solve_stub) +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/BinaryOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/BinaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..d1632f1d978b92db9e0d73e9ebcc460c0afa7c15 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/BinaryOps.h @@ -0,0 +1,124 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + + +namespace at { +struct TensorIterator; +struct TensorIteratorBase; +} + +namespace at::native { + +inline void alpha_check(const ScalarType dtype, const Scalar& alpha) { + TORCH_CHECK(! alpha.isBoolean() || dtype == ScalarType::Bool, + "Boolean alpha only supported for Boolean results."); + TORCH_CHECK(isFloatingType(dtype) || isComplexType(dtype) + || alpha.isIntegral(true), + "For integral input tensors, argument alpha must not be a floating point number."); + TORCH_CHECK(isComplexType(dtype) || !alpha.isComplex(), + "For non-complex input tensors, argument alpha must not be a complex number.") +} + +// Basic checking for all sub functions. +inline void sub_check(const TensorBase& self, const TensorBase& other) { + TORCH_CHECK(self.scalar_type() != kBool || other.scalar_type() != kBool, + "Subtraction, the `-` operator, with two bool tensors is not supported. " + "Use the `^` or `logical_xor()` operator instead.") + TORCH_CHECK(self.scalar_type() != kBool && other.scalar_type() != kBool, + "Subtraction, the `-` operator, with a bool tensor is not supported. " + "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead."); +} + +inline void sub_check(const TensorBase& self, const Scalar& scalar) { + TORCH_CHECK(self.scalar_type() != kBool || !scalar.isBoolean(), + "Subtraction, the `-` operator, with two bool tensors is not supported. " + "Use the `^` or `logical_xor()` operator instead.") + TORCH_CHECK(self.scalar_type() != kBool && !scalar.isBoolean(), + "Subtraction, the `-` operator, with a bool tensor is not supported. " + "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead."); +} + +using structured_binary_fn_alpha = void(*)(TensorIteratorBase&, const Scalar& alpha); +using structured_binary_fn_double = void(*)(TensorIteratorBase&, double); +using structured_binary_fn = void(*)(TensorIteratorBase&); + +using binary_fn_alpha = void(*)(TensorIteratorBase&, const Scalar& alpha); +using binary_fn_double = void(*)(TensorIterator&, double); +using binary_fn = void(*)(TensorIterator&); +using binary_clamp_fn_alpha = + void(*)(TensorIterator&, const Scalar& alpha, const Scalar& min_val, const Scalar& max_val); + +// NB: codegenned +DECLARE_DISPATCH(structured_binary_fn_alpha, add_stub) + +DECLARE_DISPATCH(binary_clamp_fn_alpha, add_clamp_stub) +DECLARE_DISPATCH(structured_binary_fn_alpha, sub_stub) +DECLARE_DISPATCH(structured_binary_fn, mul_stub) +DECLARE_DISPATCH(structured_binary_fn, div_true_stub) +DECLARE_DISPATCH(structured_binary_fn, div_floor_stub) +DECLARE_DISPATCH(structured_binary_fn, div_trunc_stub) +DECLARE_DISPATCH(structured_binary_fn, atan2_stub) +DECLARE_DISPATCH(structured_binary_fn, remainder_stub) +DECLARE_DISPATCH(structured_binary_fn, bitwise_and_stub) +DECLARE_DISPATCH(structured_binary_fn, bitwise_or_stub) +DECLARE_DISPATCH(structured_binary_fn, bitwise_xor_stub) +DECLARE_DISPATCH(structured_binary_fn, lshift_stub) +DECLARE_DISPATCH(structured_binary_fn, rshift_stub) +DECLARE_DISPATCH(binary_fn, logical_xor_stub) +DECLARE_DISPATCH(binary_fn, logical_and_stub) +DECLARE_DISPATCH(binary_fn, logical_or_stub) +DECLARE_DISPATCH(structured_binary_fn, lt_stub) +DECLARE_DISPATCH(structured_binary_fn, le_stub) +DECLARE_DISPATCH(structured_binary_fn, gt_stub) +DECLARE_DISPATCH(structured_binary_fn, ge_stub) +DECLARE_DISPATCH(structured_binary_fn, eq_stub) +DECLARE_DISPATCH(structured_binary_fn, ne_stub) +DECLARE_DISPATCH(binary_fn, max_elementwise_stub) +DECLARE_DISPATCH(binary_fn, min_elementwise_stub) +DECLARE_DISPATCH(structured_binary_fn, maximum_stub) +DECLARE_DISPATCH(structured_binary_fn, minimum_stub) +DECLARE_DISPATCH(structured_binary_fn, fmax_stub) +DECLARE_DISPATCH(structured_binary_fn, fmin_stub) +DECLARE_DISPATCH(structured_binary_fn_double, smooth_l1_stub) +DECLARE_DISPATCH(binary_fn_double, huber_stub) +DECLARE_DISPATCH(structured_binary_fn, sigmoid_backward_stub) +DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub) +DECLARE_DISPATCH(structured_binary_fn, tanh_backward_stub) +DECLARE_DISPATCH(structured_binary_fn, mse_stub) +DECLARE_DISPATCH(structured_binary_fn, fmod_stub) +DECLARE_DISPATCH(structured_binary_fn, logaddexp_stub) +DECLARE_DISPATCH(structured_binary_fn, logaddexp2_stub) +DECLARE_DISPATCH(structured_binary_fn, gcd_stub) +DECLARE_DISPATCH(structured_binary_fn, lcm_stub) +DECLARE_DISPATCH(structured_binary_fn, hypot_stub) +DECLARE_DISPATCH(structured_binary_fn, igamma_stub) +DECLARE_DISPATCH(structured_binary_fn, igammac_stub) +DECLARE_DISPATCH(structured_binary_fn, nextafter_stub) +DECLARE_DISPATCH(structured_binary_fn, heaviside_stub) +DECLARE_DISPATCH(structured_binary_fn, copysign_stub) +DECLARE_DISPATCH(structured_binary_fn, xlogy_stub) +DECLARE_DISPATCH(structured_binary_fn, xlog1py_stub) +DECLARE_DISPATCH(structured_binary_fn, zeta_stub) +DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_t_stub) +DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_u_stub) +DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_v_stub) +DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_w_stub) +DECLARE_DISPATCH(structured_binary_fn, hermite_polynomial_h_stub) +DECLARE_DISPATCH(structured_binary_fn, hermite_polynomial_he_stub) +DECLARE_DISPATCH(structured_binary_fn, laguerre_polynomial_l_stub) +DECLARE_DISPATCH(structured_binary_fn, legendre_polynomial_p_stub) +DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_t_stub) +DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_u_stub) +DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_v_stub) +DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_w_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/BucketizationUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/BucketizationUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..dcf766d6e007a78e4be6de84e4ac693467604b34 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/BucketizationUtils.h @@ -0,0 +1,178 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +// original values given by raw_*. If an original value is not contiguous, will make a contiguous copy to +// the corresponding trimmed_* value. Additionally, if the dtypes of the boundary and input tensor do not +// match, will change them to be a common super type so comparisons are done between the same types. +// For any trimmed_* tensor, if its outgoing value matches what it was incoming (typically null), then the +// corresponding raw_* version should be used since it was already contiguous of the right type. +inline void searchsorted_maybe_trim_input_tensors( + Tensor& trimmed_input, + Tensor& trimmed_boundaries, + Tensor& trimmed_sorter, + const Tensor& raw_input, + const Tensor& raw_boundaries, + const Tensor& raw_sorter) { + bool in_is_contiguous = raw_input.is_contiguous(); + bool bd_is_contiguous = raw_boundaries.is_contiguous(); + bool sort_is_contiguous = raw_sorter.is_contiguous(); + + if (!in_is_contiguous) { + TORCH_WARN_ONCE("torch.searchsorted(): input value tensor is non-contiguous, this will lower the performance due " + "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous input value " + "tensor if possible. This message will only appear once per program."); + trimmed_input = raw_input.contiguous(); + } + if (!bd_is_contiguous) { + TORCH_WARN_ONCE("torch.searchsorted(): boundary tensor is non-contiguous, this will lower the performance due " + "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous boundary " + "tensor if possible. This message will only appear once per program."); + trimmed_boundaries = raw_boundaries.contiguous(); + } + if (!sort_is_contiguous) { + TORCH_WARN_ONCE("torch.searchsorted(): sorter tensor is non-contiguous, this will lower the performance due " + "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous sorter " + "tensor if possible. This message will only appear once per program."); + trimmed_sorter = raw_sorter.contiguous(); + } + if (raw_input.dtype() != raw_boundaries.dtype()) { + at::native::ResultTypeState state = {}; + state = at::native::update_result_type_state(raw_boundaries, state); + state = at::native::update_result_type_state(raw_input, state); + ScalarType common_stype = at::native::result_type(state); + + TORCH_INTERNAL_ASSERT(common_stype != ScalarType::Undefined); + if (common_stype != raw_input.scalar_type()) { + trimmed_input = in_is_contiguous ? raw_input.to(common_stype) : trimmed_input.to(common_stype); + } + if (common_stype != raw_boundaries.scalar_type()) { + trimmed_boundaries = bd_is_contiguous ? raw_boundaries.to(common_stype) : trimmed_boundaries.to(common_stype); + } + } +} + +/* unused but needed for internal jagged tensor class */ +inline void searchsorted_maybe_trim_input_tensors( + Tensor& trimmed_input, + Tensor& trimmed_boundaries, + const Tensor& raw_input, + const Tensor& raw_boundaries) { + Tensor trimmed_sorter; + Tensor raw_sorter; + searchsorted_maybe_trim_input_tensors( + trimmed_input, + trimmed_boundaries, + trimmed_sorter, + raw_input, + raw_boundaries, + raw_sorter); +} + +inline bool searchsorted_dims_matched_before_last_dim(const Tensor& boundaries, const Tensor& input) { + if (boundaries.dim() != input.dim()) { + return false; + } + const auto& dims_bd = boundaries.sizes(); + const auto& dims_in = input.sizes(); + for (int64_t dim = 0; dim + 1 < boundaries.dim(); ++dim) { + if (dims_bd[dim] != dims_in[dim]) { + return false; + } + } + return true; +} + +inline Tensor searchsorted_scalar_tensor(const Scalar& scalar, const c10::Device& device) { + auto tensor = c10::scalar_to_tensor(scalar, device); + // This is to adopt the scalar promotion rules defined in native/TypeProperties.h + // So we have the same type promotion rules as binary operations. + tensor.unsafeGetTensorImpl()->set_wrapped_number(true); + return tensor; +} + +inline void searchsorted_pre_check( + const Tensor& boundaries, + const Tensor& input, + const Tensor& output, + const bool out_int32, + const bool right, + const std::optional side_opt, + const Tensor& sorter) { + if (side_opt) { + const std::string_view side = *side_opt; + TORCH_CHECK(side == "left" || side == "right", "torch.searchsorted(): side can only be 'left' or 'right' but ", + "got ", side); + + // assume the user has not explicitly set (right=False, side="right") + TORCH_CHECK(!right || side == "right", "torch.searchsorted(): side and right can't be set to opposites, got side " + "of ", side, " while right was True"); + } + + TORCH_CHECK(boundaries.device() == input.device(), "torch.searchsorted(): boundaries and input value tensors ", + "should have same device type, but got boundaries tensor device type ", boundaries.device(), " and input value ", + "tensor device type ", input.device()); + + if (sorter.defined()) { + TORCH_CHECK(sorter.device() == boundaries.device(), "torch.searchsorted(): sorter and boundary tensors should ", + "have same device type, but got sorter tensor device type ", sorter.device(), " and input value tensor ", + "device type ", boundaries.device()); + + TORCH_CHECK(sorter.sizes() == boundaries.sizes(), "torch.searchsorted(): boundary and sorter must have the same " + "size, but got boundary tensor ", boundaries.sizes(), "and got sorter tensor ", sorter.sizes()); + + TORCH_CHECK(sorter.scalar_type() == ScalarType::Long, "torch.searchsorted(): sorter must be a tensor of long ", + "dtype but got dtype ", sorter.scalar_type()); + + if (sorter.numel() > 0) { + auto minmax = sorter.aminmax(); + int64_t vmin = std::get<0>(minmax).item().toLong(); + int64_t vmax = std::get<1>(minmax).item().toLong(); + TORCH_CHECK(vmin >= 0 && vmax < sorter.sizes().back(), "torch.searchsorted(): sorter index out of range"); + } + } + + TORCH_CHECK(input.dim() > 0 || (input.dim() == 0 && input.numel() == 1 && boundaries.dim() == 1), + "torch.searchsorted(): input value can be a scalar only when boundaries tensor dimension is 1, but we got ", + "boundaries tensor dim(", boundaries.dim(), ") and input value's dim(", input.dim(), ") numel(", + input.numel(), ")"); + + TORCH_CHECK(boundaries.dim() != 0, "torch.searchsorted(): boundaries tensor should have positive dimension, but ", + "got 0 dimension"); + + TORCH_CHECK(boundaries.dim() == 1 || searchsorted_dims_matched_before_last_dim(boundaries, input), + "torch.searchsorted(): boundaries tensor should be 1 dimension or the first N-1 dimensions of boundaries tensor ", + "and input value tensor must match, but we got boundaries tensor ", boundaries.sizes(), " and input value tensor ", + input.sizes()); + + ScalarType output_dtype = output.scalar_type(); + TORCH_CHECK( + (output_dtype == ScalarType::Long && !out_int32) || + (output_dtype == ScalarType::Int && out_int32), + "torch.searchsorted(): output tensor's dtype is wrong, it can only be Int(int32) or Long(int64) depending on ", + "whether out_int32 flag is True, but we got output tensor's dtype ", output_dtype, + " and out_int32 flag is ", (out_int32 ? "True" : "False")); + + if (out_int32) { + TORCH_CHECK(boundaries.sizes().back() < INT_MAX, + "torch.searchsorted(): the size of boundaries' last dimension should be less than ", INT_MAX, ", but we got ", + boundaries.sizes().back()); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CPUBlas.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CPUBlas.h new file mode 100644 index 0000000000000000000000000000000000000000..7cdf10c6d4fdccafa542cbc37da584a0fe17835d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CPUBlas.h @@ -0,0 +1,319 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + + +namespace at::native::cpublas { + +namespace internal { +void normalize_last_dims( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + int64_t *lda, int64_t *ldb, int64_t *ldc); +} // namespace internal + +using gemm_fn = void(*)( + at::ScalarType type, + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const Scalar& alpha, + const void *a, int64_t lda, + const void *b, int64_t ldb, + const Scalar& beta, + void *c, int64_t ldc); + +DECLARE_DISPATCH(gemm_fn, gemm_stub) + +using gemm_no_downcast_fn = void(*)( + at::ScalarType type, + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const Scalar& alpha, + const void *a, int64_t lda, + const void *b, int64_t ldb, + const Scalar& beta, + void *c, int64_t ldc); + +DECLARE_DISPATCH(gemm_no_downcast_fn, gemm_no_downcast_stub) + +template +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + at::opmath_type alpha, + const scalar_t *a, int64_t lda, + const scalar_t *b, int64_t ldb, + at::opmath_type beta, + scalar_t *c, int64_t ldc) { + internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc); + gemm_stub( + kCPU, c10::CppTypeToScalarType::value, + transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +} + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + double alpha, + const double *a, int64_t lda, + const double *b, int64_t ldb, + double beta, + double *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const float *a, int64_t lda, + const float *b, int64_t ldb, + float beta, + float *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const at::BFloat16 *a, int64_t lda, + const at::BFloat16 *b, int64_t ldb, + float beta, + at::BFloat16 *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const float alpha, + const at::BFloat16 *a, int64_t lda, + const at::BFloat16 *b, int64_t ldb, + const float beta, + float *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const at::Half *a, int64_t lda, + const at::Half *b, int64_t ldb, + float beta, + at::Half *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const float alpha, + const at::Half *a, int64_t lda, + const at::Half *b, int64_t ldb, + const float beta, + float *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + c10::complex alpha, + const c10::complex *a, int64_t lda, + const c10::complex *b, int64_t ldb, + c10::complex beta, + c10::complex *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + c10::complex alpha, + const c10::complex *a, int64_t lda, + const c10::complex *b, int64_t ldb, + c10::complex beta, + c10::complex *c, int64_t ldc); + +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + int64_t alpha, + const int64_t *a, int64_t lda, + const int64_t *b, int64_t ldb, + int64_t beta, + int64_t *c, int64_t ldc); + +template +void gemm_batched( + TransposeType transa, TransposeType transb, + int64_t batch_size, int64_t m, int64_t n, int64_t k, + scalar_t alpha, + const scalar_t * const *a, int64_t lda, + const scalar_t * const *b, int64_t ldb, + const scalar_t beta, + scalar_t * const *c, int64_t ldc); + +template +void gemm_batched_with_stride( + TransposeType transa, TransposeType transb, + int64_t batch_size, int64_t m, int64_t n, int64_t k, + scalar_t alpha, + const scalar_t *a, int64_t lda, int64_t batch_stride_a, + const scalar_t *b, int64_t ldb, int64_t batch_stride_b, + scalar_t beta, + scalar_t *c, int64_t ldc, int64_t batch_stride_c); + +using axpy_fn = void(*)(at::ScalarType type, int64_t n, const Scalar& a, const void *x, int64_t incx, void *y, int64_t incy); + +DECLARE_DISPATCH(axpy_fn, axpy_stub) + +template +void axpy(int64_t n, scalar_t a, const scalar_t *x, int64_t incx, scalar_t *y, int64_t incy){ + if(n == 1) + { + incx = 1; + incy = 1; + } + axpy_stub( + kCPU, c10::CppTypeToScalarType::value, + n, a, x, incx, y, incy); +} + +void axpy(int64_t n, double a, const double *x, int64_t incx, double *y, int64_t incy); +void axpy(int64_t n, float a, const float *x, int64_t incx, float *y, int64_t incy); +void axpy(int64_t n, c10::complex a, const c10::complex *x, int64_t incx, c10::complex *y, int64_t incy); +void axpy(int64_t n, c10::complex a, const c10::complex *x, int64_t incx, c10::complex *y, int64_t incy); + +using copy_fn = void(*)(at::ScalarType type, int64_t n, const void *x, int64_t incx, void *y, int64_t incy); + +DECLARE_DISPATCH(copy_fn, copy_stub) + +template +void copy(int64_t n, const scalar_t *x, int64_t incx, scalar_t *y, int64_t incy) { + if(n == 1) + { + incx = 1; + incy = 1; + } + copy_stub( + kCPU, c10::CppTypeToScalarType::value, + n, x, incx, y, incy); +} + +void copy(int64_t n, const double *x, int64_t incx, double *y, int64_t incy); +void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy); +void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex *y, int64_t incy); +void copy(int64_t n, const c10::complex *x, int64_t incx, c10::complex *y, int64_t incy); + +// Batch-reduce GEMM +// Operates by the following formula: +// C = SUM(A[i] x B[i]) + C if add_C is true, i = 0 to batch size +// A Base pointer to a tensor A. +// B Base pointer to a tensor B. +// C Pointer to a tensor C (accumulation buffer). +// Note only batch size 1 is used currently + +// Define macros for available brgemm APIs +// so that callers can determine which APIs are available +#define CPUBLAS_BRGEMM_F16F16F32 // half * half -> float +#define CPUBLAS_BRGEMM_BF16BF16F32 // bfloat16 * bfloat16 -> float +#define CPUBLAS_BRGEMM_F32F32F32 // float * float -> float +#define CPUBLAS_BRGEMM_U8U8I32 // unsigned char * unsigned char -> int32 +#define CPUBLAS_BRGEMM_U8I8I32 // unsigned char * signed char -> int32 +#define CPUBLAS_BRGEMM_I8I8I32 // signed char * signed char -> int32 + +TORCH_API void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const bool add_C, + const at::Half* A, + const at::Half* B, + float* C, + bool is_vnni = true); + +TORCH_API void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const bool add_C, + const at::BFloat16* A, + const at::BFloat16* B, + float* C, + bool is_vnni = true); + +TORCH_API void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const bool add_C, + const float* A, + const float* B, + float* C, + bool is_vnni = false); + +TORCH_API void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const bool add_C, + const unsigned char* A, + const unsigned char* B, + int32_t* C, + bool is_vnni = true); + +TORCH_API void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const bool add_C, + const unsigned char* A, + const signed char* B, + int32_t* C, + bool is_vnni = true); + +TORCH_API void brgemm( + int64_t M, + int64_t N, + int64_t K, + int64_t ld_a, + int64_t ld_b, + int64_t ld_c, + const bool add_C, + const signed char* A, + const signed char* B, + int32_t* C, + bool is_vnni = true); + +// Release brgemm hardware context +TORCH_API void brgemm_release(bool is_vnni = true); + +// Pack B matrix to get better performance if needed +TORCH_API void pack( + int64_t K, + int64_t N, + int64_t ld_in, + int64_t ld_out, + ScalarType dt_in, + ScalarType dt_out, + const void* in, + void* out); + +// Whether pack is supported in the platform. +TORCH_API bool could_pack(ScalarType dt_in); + +} // namespace at::native::cpublas + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CPUFallback.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CPUFallback.h new file mode 100644 index 0000000000000000000000000000000000000000..daad5e2fdad114d3a45076e682d5f9c0417db9f3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CPUFallback.h @@ -0,0 +1,51 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// This function implements a boxed fallback to CPU. +// External backends can add their own custom logging on top if it to customize their own CPU fallbacks. +TORCH_API void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool error_on_views = false, + c10::DispatchKey cpu_dispatch_key = c10::DispatchKey::CPU); + +// This is a helper function that backends can use to directly call their boxed CPU fallback +// TODO: update and add a usage example after https://github.com/pytorch/pytorch/pull/58092 lands. +template +struct _call_fallback_fn final {}; + +template +struct _call_fallback_fn final { + static ReturnType call(typename c10::maybe_keep_symint::type... args) { + auto op = c10::Dispatcher::singleton() + // TODO: figure out how to make compiler happy without dynamic casts + .findSchemaOrThrow((const char*) Op::name, (const char*) Op::overload_name) + //.findSchemaOrThrow("a", "b") + .typed::type...)>(); + return c10::impl::BoxedKernelWrapper::type...)>::call( + c10::BoxedKernel::makeFromFunction(), + op, + c10::DispatchKeySet(), // we know that the cpu_fallback doesn't use the dispatch keyset. + // TODO: get std::forward<> to work + args... + ); + } +}; + +template +using call_fallback_fn_symint = _call_fallback_fn; + +template +using call_fallback_fn = _call_fallback_fn; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h new file mode 100644 index 0000000000000000000000000000000000000000..71cb3ddfcfb5f1634532e29798dd6868bb8b5a3c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h @@ -0,0 +1,18 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +TORCH_API bool canUse32BitIndexMath(const at::TensorBase &t, int64_t max_elem=std::numeric_limits::max()); + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ComplexHelper.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ComplexHelper.h new file mode 100644 index 0000000000000000000000000000000000000000..3ba035b9ea17a17603bc3b445014b4e5f31aa5b3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ComplexHelper.h @@ -0,0 +1,102 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include + +#include +#endif + +// WARNING: this header contains non-inline functions and should be only +// included from ONE cpp file + +namespace at::native { + +// View tensor with new dtype, storage offset, sizes and strides +inline Tensor view_tensor( + const Tensor &tensor, ScalarType dtype, + c10::SymInt offset, SymIntArrayRef sizes, SymIntArrayRef strides) { + Storage storage = tensor.storage(); + auto key_set = tensor.key_set().remove(DispatchKey::Conjugate); + auto new_tensor = detail::make_tensor( + c10::TensorImpl::VIEW, std::move(storage), key_set, scalarTypeToTypeMeta(dtype)); + auto * impl = new_tensor.unsafeGetTensorImpl(); + impl->set_sizes_and_strides(sizes, strides, offset); + return new_tensor; +} + +inline SymDimVector computeStrideForViewAsReal(SymIntArrayRef oldstride) { + SymDimVector res(oldstride.size() + 1); + for (const auto i : c10::irange(oldstride.size())) { + res[i] = oldstride[i] * 2; + } + res.back() = 1; + return res; +} + +inline Tensor _view_as_real_physical(const Tensor& self) { + TORCH_CHECK(self.is_complex(), "view_as_real is only supported for complex tensors"); + auto old_sizes = self.sym_sizes(); + SymDimVector new_sizes(old_sizes.size() + 1); + std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin()); + // last dimension will always have two elements containing the real and imag vals + new_sizes.back() = 2; + auto new_strides = computeStrideForViewAsReal(self.sym_strides()); + auto new_storage_offset = self.sym_storage_offset() * 2; + const auto float_type = c10::toRealValueType(self.scalar_type()); + auto real_tensor = view_tensor(self, float_type, std::move(new_storage_offset), new_sizes, new_strides); + return real_tensor; +} + +// expects as input a complex tensor and returns back a tensor +// with corresponding real dtype containing the complex values +// in the last two dimensions +Tensor view_as_real(const Tensor& self) { + TORCH_CHECK(!self.is_conj(), "view_as_real doesn't work on unresolved conjugated tensors. To resolve the conjugate tensor so you can view it as real, use self.resolve_conj(); however, be warned that the resulting tensor will NOT alias the original."); + return _view_as_real_physical(self); +} + +inline SymDimVector computeStrideForViewAsComplex(SymIntArrayRef oldstride) { + const auto dim = oldstride.size(); + TORCH_CHECK(dim > 0 && oldstride[dim - 1] == 1, "Tensor must have a last dimension with stride 1"); + + SymDimVector res(dim - 1); + for (const auto i : c10::irange(res.size())) { + TORCH_CHECK(oldstride[i] % 2 == 0, "Tensor must have a stride divisible by 2 for all but last dimension"); + res[i] = oldstride[i] / 2; + } + return res; +} + +// expects as input a float or double tensor with last dimension of size 2 +// and returns back a tensor with corresponding complex dtype +Tensor view_as_complex(const Tensor& self) { + TORCH_CHECK( + self.scalar_type() == kFloat || self.scalar_type() == kDouble || self.scalar_type() == kHalf, + "view_as_complex is only supported for half, float and double tensors, but got a tensor of scalar type: ", self.scalar_type()); + + auto old_sizes = self.sym_sizes(); + TORCH_CHECK(!old_sizes.empty(), "Input tensor must have one or more dimensions"); + TORCH_CHECK(old_sizes[old_sizes.size()-1] == 2, "Tensor must have a last dimension of size 2"); + SymDimVector new_sizes(old_sizes.begin(), old_sizes.end() - 1); + + const auto new_strides = computeStrideForViewAsComplex(self.sym_strides()); + const auto complex_type = c10::toComplexType(self.scalar_type()); + + TORCH_CHECK(self.sym_storage_offset() % 2 == 0, "Tensor must have a storage_offset divisible by 2"); + const auto new_storage_offset = self.sym_storage_offset() / 2; + + return view_tensor(self, complex_type, new_storage_offset, new_sizes, new_strides); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h new file mode 100644 index 0000000000000000000000000000000000000000..ab99b0ce5496931d41695af614e67afdfd6af437 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h @@ -0,0 +1,39 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +struct TupleInfoCPU { + template + using tuple = std::tuple; + + template + static constexpr auto tie(Types&... args) noexcept { + return std::tie(args...); + } +}; + +template +using CompositeRandomAccessorCPU = + CompositeRandomAccessor; + +template +void swap( + references_holder rh1, + references_holder rh2 +) { + return std::swap(rh1.data(), rh2.data()); +} + +template +auto get(references_holder rh) -> decltype(std::get(rh.data())) { + return std::get(rh.data()); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h new file mode 100644 index 0000000000000000000000000000000000000000..51401ef05a7ea39d3f7969424f50fe838652dd35 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h @@ -0,0 +1,268 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#pragma once + +namespace at::native { + +namespace { + +// operator_brackets_proxy is used in +// CompositeRandomAccessor in place of operator[]. +// For some iterators, references returned by operator[] +// could become invalid, operator_brackets_proxy tries to +// resolve that by making accessor[n] to be equivalent to +// *(accessor + n). +template +class operator_brackets_proxy { + using reference = typename std::iterator_traits::reference; + using value_type = typename std::iterator_traits::value_type; + +public: + C10_HOST_DEVICE + operator_brackets_proxy(Accessor const& accessor) + : accessor(accessor) + {} + + C10_HOST_DEVICE + operator reference() { + return *accessor; + } + + C10_HOST_DEVICE + reference operator*() { + return *accessor; + } + + C10_HOST_DEVICE + operator_brackets_proxy& operator=(value_type const& val) { + *accessor = val; + return *this; + } + +private: + Accessor accessor; +}; + +} + +// references_holder is used as a surrogate for the +// references type from std::iterator_traits in CompositeRandomAccessor. +// It is assumed in CompositeRandomAccessor that +// References = tuple, +// Values = tuple by default, +// but they could be anything as long as References could be +// cast to Values. +// If you plan to use it with STL, for example, you will need to +// define 'swap` and `get`(aka std::get) methods. +template +class references_holder { +public: + using values = Values; + using references = References; + + C10_HOST_DEVICE + references_holder(references refs) + : refs{std::move(refs)} + {} + + C10_HOST_DEVICE + operator references() { + return refs; + } + + C10_HOST_DEVICE + operator values() { + return refs; + } + + C10_HOST_DEVICE + references_holder& operator=(values vals) { + refs = vals; + return *this; + } + + C10_HOST_DEVICE + references& data() { + return refs; + } + +protected: + references refs; +}; + +// CompositeRandomAccessor is essentially a simplified version of +// a random access iterator over two random access iterators. +// TupleInfo should contain a variadic type `tuple`, and a method `tie`, +// which constructs a tuple of references from a variadic list of arguments. +template +class CompositeRandomAccessor { + using self_type = CompositeRandomAccessor; + + using key_accessor_value_type = + typename std::iterator_traits::value_type; + using value_accessor_value_type = + typename std::iterator_traits::value_type; + using key_accessor_reference_type = + typename std::iterator_traits::reference; + using value_accessor_reference_type = + typename std::iterator_traits::reference; + + using composite_value_type = typename TupleInfo::template tuple< + key_accessor_value_type, + value_accessor_value_type>; + using composite_reference = typename TupleInfo::template tuple< + key_accessor_reference_type, + value_accessor_reference_type>; + +public: + using value_type = composite_value_type; + using reference = references_holder; + // Note that CompositeRandomAccessor does not hold key and values + // in a specific datastructure, which means that a pointer to a (key, value) + // is not defined. Hence we just use a pointer type of the KeyAccessor. + using pointer = typename std::iterator_traits::pointer; + using difference_type = typename std::iterator_traits::difference_type; + using iterator_category = std::random_access_iterator_tag; + + C10_HOST_DEVICE + CompositeRandomAccessor() = default; + + C10_HOST_DEVICE + CompositeRandomAccessor(KeyAccessor keys, ValueAccessor values) + : keys(keys), values(values) + {} + + // Pointer-like operations { + C10_HOST_DEVICE + reference operator*() const { + return TupleInfo::tie(*keys, *values); + } + + // operator->() is supposed to return a pointer type. + // Since CompositeRandomAccessor does not hold pointers to pairs, + // we just return a pointer to a key. + C10_HOST_DEVICE + auto* operator->() const { + return keys.operator->(); + } + + C10_HOST_DEVICE + reference operator[](difference_type idx) { + return operator_brackets_proxy( + CompositeRandomAccessor(keys + idx, values + idx) + ); + } + // } + + // Prefix/postfix increment/decrement { + C10_HOST_DEVICE + CompositeRandomAccessor& operator++() { + ++keys; + ++values; + return *this; + } + + C10_HOST_DEVICE + CompositeRandomAccessor operator++(int) { + CompositeRandomAccessor copy(*this); + ++*this; + return copy; + } + + C10_HOST_DEVICE + CompositeRandomAccessor& operator--() { + --keys; + --values; + return *this; + } + + C10_HOST_DEVICE + CompositeRandomAccessor operator--(int) { + CompositeRandomAccessor copy(*this); + --*this; + return copy; + } + // } + + // Arithmetic operations { + C10_HOST_DEVICE + CompositeRandomAccessor& operator+=(difference_type offset) { + keys += offset; + values += offset; + return *this; + } + + C10_HOST_DEVICE + CompositeRandomAccessor operator+(difference_type offset) const { + return CompositeRandomAccessor(keys + offset, values + offset); + } + + C10_HOST_DEVICE + friend CompositeRandomAccessor operator+( + difference_type offset, + const CompositeRandomAccessor& accessor + ) { + return accessor + offset; + } + + C10_HOST_DEVICE + CompositeRandomAccessor& operator-=(difference_type offset) { + keys -= offset; + values -= offset; + return *this; + } + + C10_HOST_DEVICE + CompositeRandomAccessor operator-(difference_type offset) const { + return CompositeRandomAccessor(keys - offset, values - offset); + } + + C10_HOST_DEVICE + difference_type operator-(const CompositeRandomAccessor& other) const { + return keys - other.keys; + } + // } + + // Comparison operators { + C10_HOST_DEVICE + bool operator==(const CompositeRandomAccessor& other) const { + return keys == other.keys; + } + + C10_HOST_DEVICE + bool operator!=(const CompositeRandomAccessor& other) const { + return keys != other.keys; + } + + C10_HOST_DEVICE + bool operator<(const CompositeRandomAccessor& other) const { + return keys < other.keys; + } + + C10_HOST_DEVICE + bool operator<=(const CompositeRandomAccessor& other) const { + return keys <= other.keys; + } + + C10_HOST_DEVICE + bool operator>(const CompositeRandomAccessor& other) const { + return keys > other.keys; + } + + C10_HOST_DEVICE + bool operator>=(const CompositeRandomAccessor& other) const { + return keys >= other.keys; + } + // } + +protected: + KeyAccessor keys; + ValueAccessor values; +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ConvUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ConvUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..a5ced6932024060b664136205e8317ac4b930ef0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ConvUtils.h @@ -0,0 +1,480 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { + +using conv_depthwise2d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(conv_depthwise2d_backward_fn, conv_depthwise2d_backward_stub) +using conv_depthwise3d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(conv_depthwise3d_backward_fn, conv_depthwise3d_backward_stub) +using cudnn_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, bool, std::array); +DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub) +using mps_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mps_convolution_backward_fn, mps_convolution_backward_stub) +using cudnn_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array); +DECLARE_DISPATCH(cudnn_convolution_transpose_backward_fn, cudnn_convolution_transpose_backward_stub) +using miopen_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_convolution_backward_fn, miopen_convolution_backward_stub) +using miopen_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_convolution_transpose_backward_fn, miopen_convolution_transpose_backward_stub) +using miopen_depthwise_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, bool, bool, std::array); +DECLARE_DISPATCH(miopen_depthwise_convolution_backward_fn, miopen_depthwise_convolution_backward_stub) +using mkldnn_convolution_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub) +using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const std::optional&, + IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t); +DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub) +using mkldnn_convolution_transpose_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, int64_t, std::array); +DECLARE_DISPATCH(mkldnn_convolution_transpose_backward_fn, mkldnn_convolution_transpose_backward_stub) +using slow_conv_dilated2d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_dilated2d_backward_fn, slow_conv_dilated2d_backward_stub) +using slow_conv_dilated3d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_dilated3d_backward_fn, slow_conv_dilated3d_backward_stub) +using slow_conv_transpose2d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_transpose2d_backward_fn, slow_conv_transpose2d_backward_stub) +using slow_conv_transpose3d_backward_fn = std::tuple(*)( + const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef, + at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array); +DECLARE_DISPATCH(slow_conv_transpose3d_backward_fn, slow_conv_transpose3d_backward_stub) + +namespace { + bool is_cudnnv8_heuristic_mode_b() { + static const bool is_cudnnv8_heuristic_mode_b = c10::utils::check_env("TORCH_CUDNN_USE_HEURISTIC_MODE_B") == true; + return is_cudnnv8_heuristic_mode_b; + } +} + +inline bool cudnnv8_enabled_check_debug() { + static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_DISABLED") != true; + static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true; + static uint8_t cudnnv8_debugcount = 0; + if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) { + TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8 ON: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", is_cudnnv8_heuristic_mode_b()); + cudnnv8_debugcount++; + } + return cudnnv8_flag == 1; +} + +inline bool cudnnv8_use_heur_mode_b() { + return is_cudnnv8_heuristic_mode_b(); +} + +// Keep in sync with py::enum_ in Module.cpp +enum class ConvBackend { + CudaDepthwise2d, + CudaDepthwise3d, + Cudnn, + CudnnTranspose, + Empty, + Miopen, + MiopenDepthwise, + MiopenTranspose, + Mkldnn, + MkldnnTranspose, + MkldnnEmpty, + NnpackSpatial, + Overrideable, + Slow2d, + Slow3d, + SlowDilated2d, + SlowDilated3d, + SlowTranspose2d, + SlowTranspose3d, + Winograd3x3Depthwise, + Xnnpack2d, + Mps, + MpsTranspose, +}; + +// Overload for selecting the convolution backend from the full set of convolution inputs. +// This overload is exposed to python for testing, etc. +TORCH_API ConvBackend select_conv_backend( + const Tensor& input, const Tensor& weight, const std::optional& bias_opt, + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, + bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt); + +TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input, + const Tensor& weight, + const ConvBackend backend); + +// --------------------------------------------------------------------- +// +// Math +// +// --------------------------------------------------------------------- + +constexpr int input_batch_size_dim = 0; // also grad_input +constexpr int input_channels_dim = 1; +constexpr int output_batch_size_dim = 0; // also grad_output +constexpr int output_channels_dim = 1; +constexpr int weight_output_channels_dim = 0; +constexpr int weight_input_channels_dim = 1; + +// Often written as 2 + max_dim (extra dims for batch size and channels) +constexpr int max_dim = 3; + +// --------------------------------------------------------------------- +// +// Checking +// +// --------------------------------------------------------------------- + +// Used on pad, stride and dilation +static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name) +{ + TORCH_CHECK(args.size() <= expected_size, + "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + TORCH_CHECK(args.size() >= expected_size, + "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ", + expected_size, " (while checking arguments for ", c, ")"); + + auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;}); + if (num_negative_values > 0){ + std::stringstream ss; + ss << arg_name << " should be greater than zero but got ("; + std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); + ss << args.back() << ")" << " (while checking arguments for " << c << ')'; + TORCH_CHECK(false, ss.str()); + } +} + + +// NOTE [ Convolution checks ] +// +// NB: For many call sites, it is not strictly necessary to check all of +// these relationships (for example, for forward convolution, we compute +// the size of output ourselves, so we don't actually need to check +// output. However, writing a single function that does everything +// means we get to reuse it for both forwards and all backwards +// variants, even when the set of "real" inputs varies. The magic of +// relational computing! +// +// (There is one downside, which is that it is slightly harder to write +// error messages which are able to distinguish between real inputs +// (which the user can change) and computed inputs (which the user can +// only indirectly affect). It would be an interesting exercise to +// come up with a general framework to handle such situations.) +inline void convolution_shape_check( + CheckedFrom c, + const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) +{ + check_args(c, padding, input->dim() - 2, "padding"); + check_args(c, stride, padding.size(), "stride"); + check_args(c, dilation, padding.size(), "dilation"); + + // Input + checkDimRange(c, input, 3, 6 /* exclusive */); + checkSize_symint(c, input, input_channels_dim, weight->size(1) * groups); + + // Weight + checkSameDim(c, input, weight); + + // TODO: check that output->size() matches output_sizes + // TODO: check that weight matches output->sizes() + checkSameDim(c, input, output); +} + +// NB: conv_output_size and conv_input_size are not bijections, +// as conv_output_size loses information; this is why conv_input_size +// takes an extra output_padding argument to resolve the ambiguity. + +template +inline std::vector _conv_output_size( + ArrayRef input_size, ArrayRef weight_size, + ArrayRef padding, ArrayRef stride, ArrayRef dilation = ArrayRef() +) { + // ASSERT(input_size.size() > 2) + // ASSERT(input_size.size() == weight_size.size()) + bool has_dilation = !dilation.empty(); + auto dim = input_size.size(); + std::vector output_size(dim); + output_size[0] = input_size[input_batch_size_dim]; + output_size[1] = weight_size[weight_output_channels_dim]; + for (const auto d : c10::irange(2, dim)) { + auto dilation_ = has_dilation ? dilation[d - 2] : 1; + auto kernel = dilation_ * (weight_size[d] - 1) + 1; + output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1; + } + return output_size; +} + +inline std::vector conv_output_size( + IntArrayRef input_size, IntArrayRef weight_size, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef() +) { + return _conv_output_size(input_size, weight_size, padding, stride, dilation); +} + +inline std::vector conv_output_size( + SymIntArrayRef input_size, SymIntArrayRef weight_size, + SymIntArrayRef padding, SymIntArrayRef stride, SymIntArrayRef dilation = SymIntArrayRef() +) { + return _conv_output_size(input_size, weight_size, padding, stride, dilation); +} + +template +std::vector _conv_input_size( + ArrayRef output_size, ArrayRef weight_size, + ArrayRef padding, ArrayRef output_padding, ArrayRef stride, ArrayRef dilation, T groups +) { + // ASSERT(output_size.size() > 2) + // ASSERT(output_size.size() == weight_size.size()) + auto dim = output_size.size(); + std::vector input_size(dim); + input_size[0] = output_size[output_batch_size_dim]; + input_size[1] = weight_size[weight_input_channels_dim] * groups; + for (const auto d : c10::irange(2, dim)) { + auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1; + input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) + + kernel + output_padding[d - 2]; + } + return input_size; +} + +inline std::vector conv_input_size( + SymIntArrayRef output_size, SymIntArrayRef weight_size, + SymIntArrayRef padding, SymIntArrayRef output_padding, SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups +) { + return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, std::move(groups)); +} + +inline std::vector conv_input_size( + IntArrayRef output_size, IntArrayRef weight_size, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups); +} + +template +std::vector _conv_weight_size( + ArrayRef input_size, ArrayRef output_size, + ArrayRef padding, ArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + auto dim = input_size.size(); + std::vector weight_size(dim); + weight_size[0] = output_size[1]; + weight_size[1] = input_size[1] / groups; + for (const auto d : c10::irange(2, dim)) { + auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2] + + padding[d - 2] * 2 - output_padding[d - 2]; + weight_size[d] = (kernel - 1) / dilation[d - 2] + 1; + } + return weight_size; +} + +inline std::vector conv_weight_size( + SymIntArrayRef input_size, SymIntArrayRef output_size, + SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups); +} + +inline std::vector conv_weight_size( + IntArrayRef input_size, IntArrayRef output_size, + IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups +) { + return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups); +} + +inline Tensor reshape_bias(int64_t dim, const Tensor& bias) { + std::vector shape(dim, 1); + shape[1] = -1; + return bias.reshape(shape); +} + +inline at::MemoryFormat cudnn_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) { + // disable NHWC for float64 input. + if (!at::detail::getCUDAHooks().compiledWithCuDNN() || + input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return at::MemoryFormat::Contiguous; + } + long cudnn_version = at::detail::getCUDAHooks().versionCuDNN(); + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + auto weight_ndim = weight.ndimension(); + + bool can_use_cudnn_channels_last_2d = (cudnn_version >= 7603) && (weight_ndim == 4) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast) + ); + if (can_use_cudnn_channels_last_2d) { + return at::MemoryFormat::ChannelsLast; + } + + bool can_use_cudnn_channels_last_3d = (cudnn_version >= 8005) && (weight_ndim == 5) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d) + ); + if (can_use_cudnn_channels_last_3d) { + return at::MemoryFormat::ChannelsLast3d; + } + + return at::MemoryFormat::Contiguous; +} + +// controls whether emptyCache will be called following cudnn conv benchmarking +TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable); +TORCH_API bool _cudnn_get_conv_benchmark_empty_cache(); + + +inline at::MemoryFormat miopen_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) { + // disable NHWC for float64 input. + if (!at::detail::getCUDAHooks().compiledWithMIOpen() || + input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return at::MemoryFormat::Contiguous; + } + + // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen + // See https://github.com/pytorch/pytorch/issues/64427. + // non static variable is used to be able to change environment variable in runtime for testing + // enabled by default for ROCm >= 7.0.0 with miopen 3.5 + int miopen_version = detail::getCUDAHooks().compiledWithMIOpen() ? detail::getCUDAHooks().versionMIOpen() : 0; + bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0 + bool suggest_nhwc = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC").value_or(is_miopen_3_5); + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + auto weight_ndim = weight.ndimension(); + + bool can_use_miopen_channels_last_2d = suggest_nhwc && (weight_ndim == 4) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast) + ); + if (can_use_miopen_channels_last_2d) { + return at::MemoryFormat::ChannelsLast; + } + + bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && ( + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d) + ); + if (can_use_miopen_channels_last_3d) { + return at::MemoryFormat::ChannelsLast3d; + } + + return at::MemoryFormat::Contiguous; +} + +// deprecated, but to remove would be BC-breaking +inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + return miopen_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous; +} + +inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // disable NHWC for float64 input. + if (input.scalar_type() == at::kDouble || + weight.scalar_type() == at::kDouble) { + return false; + } + + // disable NHWC for MkldnnCPU tensor. + if (input.is_mkldnn() || weight.is_mkldnn()) { + return false; + } + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_mkldnn_channels_last_2d = + (input_memory_format == at::MemoryFormat::ChannelsLast) || + (weight_memory_format == at::MemoryFormat::ChannelsLast); + + bool can_use_mkldnn_channels_last_3d = + (input_memory_format == at::MemoryFormat::ChannelsLast3d) || + (weight_memory_format == at::MemoryFormat::ChannelsLast3d); + + return can_use_mkldnn_channels_last_2d || can_use_mkldnn_channels_last_3d; +} + +inline bool thnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + auto input_memory_format = input.suggest_memory_format(); + auto weight_memory_format = weight.suggest_memory_format(); + + bool can_use_thnn_channels_last_2d = input.device().is_cpu() && ( + (input_memory_format == at::MemoryFormat::ChannelsLast) || ( + weight_memory_format == at::MemoryFormat::ChannelsLast)); + + return can_use_thnn_channels_last_2d; +} + +inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // check layout only for xpu tensor. + if (!input.is_xpu() || !weight.is_xpu()) { + return false; + } + if (!input.defined() || input.is_sparse()) { + // suggest channels_first + return false; + } + + auto is_channel_last = [](const at::Tensor& t) { + auto fmt = t.suggest_memory_format(); + return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; + }; + return is_channel_last(input) || is_channel_last(weight); +} + +inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + + // check layout only for mps tensor. + if (!input.is_mps() || !weight.is_mps()) { + return false; + } + if (!input.defined() || input.is_sparse()) { + // suggest channels_first + return false; + } + + auto is_channel_last = [](const at::Tensor& t) { + auto fmt = t.suggest_memory_format(); + return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d; + }; + return is_channel_last(input) || is_channel_last(weight); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ConvolutionMM3d.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ConvolutionMM3d.h new file mode 100644 index 0000000000000000000000000000000000000000..75342eb5d919c4068ba2d912c138c6f02cd523a9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ConvolutionMM3d.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +namespace at::native { + +std::tuple slow_conv3d_backward_cpu( + const Tensor& grad_output, + const Tensor& self, + const Tensor& weight, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + std::array output_mask); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Copy.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..33ed6063a7a2fe721525855481a2b62e98760029 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Copy.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at { + +class Tensor; +struct TensorIterator; +class TensorBase; + +namespace native { + +using copy_fn = void (*)(TensorIterator&, bool non_blocking); + +DECLARE_DISPATCH(copy_fn, copy_stub) + +TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src); + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Cross.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Cross.h new file mode 100644 index 0000000000000000000000000000000000000000..2100156a2dd23e0da0df84bcb23bd21b173a2bf5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Cross.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at { +class Tensor; + +namespace native { + +using cross_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const int64_t d); + +DECLARE_DISPATCH(cross_fn, cross_stub) + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..fda88a764833121e58019f9c6142dfa016343ef6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h @@ -0,0 +1,234 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include +#include +#include + +#define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ + TORCH_CHECK( \ + T.dim() == DIM && T.size(DIM_SIZE) == SIZE, \ + "Need " #T " of dimension ", \ + DIM, \ + " and " #T ".size[", \ + DIM_SIZE, \ + "] == ", \ + SIZE, \ + " but got input to be of shape ", \ + T.sizes()) + +namespace at::native::internal { +namespace { +inline bool all_positive(IntArrayRef& arr) { + return std::all_of( + arr.begin(), arr.end(), [](int64_t item) { return item > 0; }); +} + +inline bool all_nonnegative(std::vector& arr) { + return std::all_of( + arr.begin(), arr.end(), [](int64_t item) { return item >= 0; }); +} + +} // namespace + +// calculate the rear part of output tensor sizes +template +std::vector get_output_size( + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef stride_size, + IntArrayRef pad_size, + IntArrayRef dilation_size) { + std::vector sizes; + for (const auto index : c10::irange(dim)) { + sizes.push_back( + div_rtn( + input.size(index + input.dim() - dim) + 2 * pad_size[index] - + (dilation_size[index] * (kernel_size[index] - 1) + 1), + stride_size[index]) + + 1); + } + return sizes; +} + +// calculate the sizes of output tensor +template +std::vector get_output_size( + const Tensor& input, + const Tensor& weight, + IntArrayRef kernel_size, + IntArrayRef stride_size, + IntArrayRef pad_size, + IntArrayRef dilation_size) { + auto output_size = get_output_size( + input, kernel_size, stride_size, pad_size, dilation_size); + output_size.insert(output_size.begin(), weight.size(0)); + if (input.dim() == dim + 2) { + output_size.insert(output_size.begin(), input.size(0)); + } + return output_size; +} +/* + slow_conv_dilated_shape_check - check user-input to dilated convolution + forward and backward functions. +*/ +template +void slow_conv_dilated_shape_check( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + const Tensor& grad_output, + IntArrayRef kernel_size, + IntArrayRef stride_size, + IntArrayRef pad_size, + IntArrayRef dilation_size) { + /* + When the following tensors are defined: + + bias, grad_weight, grad_output + + then these are assumed to be contiguous without checking + because of these tensors are made contiguous by calling + .contiguous() method or by resizing of zero-sized tensors in + forward/backward functions. + + When grad_weight is defined then it is assumed without + checking to have the same shape as weight, see backward + functions. + */ + // Check size arguments + TORCH_CHECK( + kernel_size.size() == dim, + "kernel sizes length should be ", + dim, + ", but got ", + kernel_size.size()); + TORCH_CHECK( + stride_size.size() == dim, + "strides length should be ", + dim, + ", but got ", + stride_size.size()); + TORCH_CHECK( + dilation_size.size() == dim, + "dilations length should be ", + dim, + ", but got ", + dilation_size.size()); + TORCH_CHECK( + pad_size.size() == dim, + "pads length should be ", + dim, + ", but got ", + pad_size.size()); + + TORCH_CHECK( + all_positive(kernel_size), + "kernel size should be greater than zero, but got ", + kernel_size); + TORCH_CHECK( + all_positive(stride_size), + "stride should be greater than zero, but got ", + stride_size); + TORCH_CHECK( + all_positive(dilation_size), + "dilation should be greater than zero, but got ", + dilation_size); + + // check input + TORCH_CHECK(input.defined(), "input must be defined"); + bool is_batch = input.dim() == dim + 2; + int64_t n = (is_batch ? 2 : 1); + int64_t ndim = n + dim; + if (!is_batch) { + // input dim has to be dim + 1 if not batched + TORCH_CHECK( + input.dim() == dim + 1, + "input must be 4D or 5D tensor but got ", + input.dim(), + "D tensor"); + } + + // check output sizes + auto output_size = get_output_size( + input, kernel_size, stride_size, pad_size, dilation_size); + + TORCH_CHECK( + all_nonnegative(output_size), + "calculated output size ", + output_size, + " is too small (all sizes must be non-negative)"); + + // check weight + TORCH_CHECK(weight.defined(), "weight must be defined"); + TORCH_CHECK( + weight.dim() == dim + 2, + "weight must be ", + dim + 2, + "D tensor but got ", + weight.dim(), + "D tensor dim=", + dim); + TORCH_CHECK( + weight.sizes().slice(2) == kernel_size, + "weight[2:] shape ", + weight.sizes().slice(2), + " must be equal to kernel_size ", + kernel_size); + + TORCH_CHECK_DIM_SIZE(input, input.dim(), (is_batch ? 1 : 0), weight.size(1)); + + // check bias when present + if (bias.defined()) { + TORCH_CHECK( + bias.dim() == 1, + "bias must be 1D tensor but got ", + bias.dim(), + "D tensor"); + TORCH_CHECK_DIM_SIZE(bias, 1, 0, weight.size(0)); + } + + // check grad_output when present + if (grad_output.defined()) { + TORCH_CHECK( + grad_output.dim() == ndim, + "grad_output must be ", + ndim, + "D tensor but got ", + grad_output.dim(), + "D tensor"); + if (is_batch) { + TORCH_CHECK( + grad_output.size(0) == input.size(0), + "grad_output.size(0)=", + grad_output.size(0), + " must be input.size(0)=", + input.size(0)); + } + TORCH_CHECK( + grad_output.size(n - 1) == weight.size(0), + "grad_output.size(", + n - 1, + ")=", + grad_output.size(n - 1), + " must be weight.size(0)=", + weight.size(0)); + TORCH_CHECK( + grad_output.sizes().slice(n) == output_size, + "grad_output[", + n, + ":] shape", + grad_output.sizes().slice(n), + " must be equal to output size ", + output_size); + } +} + +} // namespace at::native::internal + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/DispatchStub.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/DispatchStub.h new file mode 100644 index 0000000000000000000000000000000000000000..a57093238a8beea1b2d285ff17db067865eb6992 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/DispatchStub.h @@ -0,0 +1,500 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include +#include +#include + +// Implements instruction set specific function dispatch. +// +// Kernels that may make use of specialized instruction sets (e.g. AVX2) are +// compiled multiple times with different compiler flags (e.g. -mavx2). A +// DispatchStub contains a table of function pointers for a kernel. At runtime, +// the fastest available kernel is chosen based on the features reported by +// cpuinfo. +// +// Example: +// +// In native/MyKernel.h: +// using fn_type = void(*)(const Tensor& x); +// DECLARE_DISPATCH(fn_type, stub) +// +// In native/MyKernel.cpp +// DEFINE_DISPATCH(stub); +// +// In native/cpu/MyKernel.cpp: +// namespace { +// // use anonymous namespace so that different cpu versions won't conflict +// void kernel(const Tensor& x) { ... } +// } +// REGISTER_DISPATCH(stub, &kernel); +// +// To call: +// stub(kCPU, tensor); +// +// TODO: CPU instruction set selection should be folded into whatever +// the main dispatch mechanism is. +// +// Supported device types for registration: +// - CPU: Central Processing Unit +// - CUDA: NVIDIA GPUs +// - HIP: AMD GPUs +// - MPS: Apple Silicon GPUs (Metal Performance Shaders) +// - MTIA: Meta Training and Inference Devices +// - XPU: Intel GPUs +// - HPU: Reserved for HPU (Intel Gaudi) device types +// - PrivateUse1: Reserved for private/custom device types +// +// If you want to update the list of supported devices, add a new dispatch_ptr +// member in DispatchStubImpl.h and update the get_call_ptr switch. +// As well you will need to update the inlined list in 'is_device_supported` +// +// +// ignore warnings about DispatchStub::DEFAULT, AVX, AVX2 defined elsewhere +C10_CLANG_DIAGNOSTIC_PUSH() +C10_CLANG_DIAGNOSTIC_IGNORE("-Wundefined-var-template") + +namespace at::native { + +enum class CPUCapability { + DEFAULT = 0, +#if defined(HAVE_VSX_CPU_DEFINITION) + VSX = 1, +#elif defined(HAVE_ZVECTOR_CPU_DEFINITION) + ZVECTOR = 1, +#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION) + SVE256 = 1, +#else + AVX2 = 1, + AVX512 = 2, +#endif + NUM_OPTIONS +}; + +// Enum for error types +enum class ErrorType { + MissingDeviceKernel, + DeviceNotSupported +}; + +// Alias for the return type using std::variant +using DispatchResult = std::variant; + +CPUCapability get_cpu_capability(); + +template +struct DispatchStub; + +/** + * The sole purpose of this class is to outline methods that don't need to be + * specialized or otherwise inlined and duplicated (by the compiler due to + * template expansion), since it causes size bloat if there are a significant + * number of specialization of the DispatchStub<> class. + */ +struct TORCH_API DispatchStubImpl { + + // The DispatchStubImpl::try_get_call_ptr() method is used to get the call + // pointer for a given device type. If the call pointer is not found, + // DispatchStubImpl::try_get_call_ptr() returns an ErrorType. + // The main difference between try_get_call_ptr() and get_call_ptr() is that + // try_get_call_ptr() will return the ErrorType and not raise an exception. + DispatchResult try_get_call_ptr( + c10::DeviceType device_type + , void *DEFAULT +#ifdef HAVE_AVX512_CPU_DEFINITION + , void *AVX512 +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , void *AVX2 +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , void *VSX +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , void *ZVECTOR +#endif +#ifdef HAVE_SVE256_CPU_DEFINITION + , void *SVE256 +#endif + ); + + // Analogous to try_get_call_ptr(), but it will return the ErrorType and not + // raise an exception. + DispatchResult try_choose_cpu_impl( + void *DEFAULT +#ifdef HAVE_AVX512_CPU_DEFINITION + , void *AVX512 +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , void *AVX2 +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , void *VSX +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , void *ZVECTOR +#endif +#ifdef HAVE_SVE256_CPU_DEFINITION + , void *SVE256 +#endif + ); + + + void* get_call_ptr( + c10::DeviceType device_type + , void *DEFAULT +#ifdef HAVE_AVX512_CPU_DEFINITION + , void *AVX512 +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , void *AVX2 +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , void *VSX +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , void *ZVECTOR +#endif +#ifdef HAVE_SVE256_CPU_DEFINITION + , void *SVE256 +#endif + ); + + /** + * The CPU Dispatch actual method is chosen in decreasing order of preference by + * DispatchStubImpl::choose_cpu_impl() in case none is found by + * DispatchStubImpl::get_call_ptr() in cpu_dispatch_ptr. + */ + void* choose_cpu_impl( + void *DEFAULT +#ifdef HAVE_AVX512_CPU_DEFINITION + , void *AVX512 +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , void *AVX2 +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , void *VSX +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , void *ZVECTOR +#endif +#ifdef HAVE_SVE256_CPU_DEFINITION + , void *SVE256 +#endif + ); + + // Fixing dispatch error in Windows debug builds. + // See https://github.com/pytorch/pytorch/issues/22681 for more details. + #if defined(_MSC_VER) && defined(_DEBUG) + std::atomic cpu_dispatch_ptr; + void* cuda_dispatch_ptr; + void* hip_dispatch_ptr; + void* mps_dispatch_ptr; + void* mtia_dispatch_ptr; + #if defined(USE_XPU) + void* xpu_dispatch_ptr; + #endif + void* hpu_dispatch_ptr; + void* privateuse1_dispatch_ptr; + #else + std::atomic cpu_dispatch_ptr{nullptr}; + void* cuda_dispatch_ptr = nullptr; + void* hip_dispatch_ptr = nullptr; + void* mps_dispatch_ptr = nullptr; + void* mtia_dispatch_ptr = nullptr; + #if defined(USE_XPU) + void* xpu_dispatch_ptr = nullptr; + #endif + void* hpu_dispatch_ptr = nullptr; + void* privateuse1_dispatch_ptr = nullptr; + #endif +}; + +template +struct DispatchStub { + using FnPtr = rT (*) (Args...); + + DispatchStub() = default; + DispatchStub(const DispatchStub&) = delete; + DispatchStub& operator=(const DispatchStub&) = delete; + +private: + FnPtr get_call_ptr(const c10::DeviceType device_type) { + return reinterpret_cast( + impl.get_call_ptr(device_type + , reinterpret_cast(DEFAULT) +#ifdef HAVE_AVX512_CPU_DEFINITION + , reinterpret_cast(AVX512) +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , reinterpret_cast(AVX2) +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , reinterpret_cast(VSX) +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , reinterpret_cast(ZVECTOR) +#endif +#ifdef HAVE_SVE256_CPU_DEFINITION + , reinterpret_cast(SVE256) +#endif + ) + ); + } + +public: + template + rT operator()(c10::DeviceType device_type, ArgTypes&&... args) { + FnPtr call_ptr = get_call_ptr(device_type); + return (*call_ptr)(std::forward(args)...); + } + + void set_cuda_dispatch_ptr(FnPtr fn_ptr) { + impl.cuda_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + #if defined(USE_XPU) + void set_xpu_dispatch_ptr(FnPtr fn_ptr){ + impl.xpu_dispatch_ptr = reinterpret_cast(fn_ptr); + } + #endif + + void set_hpu_dispatch_ptr(FnPtr fn_ptr) { + impl.hpu_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + void set_hip_dispatch_ptr(FnPtr fn_ptr) { + impl.hip_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + void set_mps_dispatch_ptr(FnPtr fn_ptr) { + impl.mps_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + void set_mtia_dispatch_ptr(FnPtr fn_ptr) { + impl.mtia_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + void set_privateuse1_dispatch_ptr(FnPtr fn_ptr) { + impl.privateuse1_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + // Returns true if the dispatcher has a kernel registered for this device + // type. + bool is_device_supported(const c10::DeviceType device_type) { + auto result = impl.try_get_call_ptr(device_type + , reinterpret_cast(DEFAULT) +#ifdef HAVE_AVX512_CPU_DEFINITION + , reinterpret_cast(AVX512) +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , reinterpret_cast(AVX2) +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , reinterpret_cast(VSX) +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , reinterpret_cast(ZVECTOR) +#endif +#ifdef HAVE_SVE256_CPU_DEFINITION + , reinterpret_cast(SVE256) +#endif + ); + if (std::holds_alternative(result)){ + return false; + } + return true; + } + + static TORCH_API FnPtr DEFAULT; +#ifdef HAVE_AVX512_CPU_DEFINITION + static TORCH_API FnPtr AVX512; +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + static TORCH_API FnPtr AVX2; +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + static TORCH_API FnPtr VSX; +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + static TORCH_API FnPtr ZVECTOR; +#endif +#ifdef HAVE_SVE256_CPU_DEFINITION + static TORCH_API FnPtr SVE256; +#endif +private: + DispatchStubImpl impl; +}; + +namespace { +template +struct RegisterCUDADispatch { + RegisterCUDADispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_cuda_dispatch_ptr(value); + } +}; + +template +struct RegisterXPUDispatch { + RegisterXPUDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value){ + stub.set_xpu_dispatch_ptr(value); + } +}; + +template +struct RegisterHPUDispatch { + RegisterHPUDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value){ + stub.set_hpu_dispatch_ptr(value); + } +}; + +template +struct RegisterMPSDispatch { + RegisterMPSDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_mps_dispatch_ptr(value); + } +}; + +template +struct RegisterHIPDispatch { + RegisterHIPDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + // TODO: make this point at hip_dispatch_ptr + stub.set_cuda_dispatch_ptr(value); + } +}; + +template +struct RegisterMTIADispatch { + RegisterMTIADispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_mtia_dispatch_ptr(value); + } +}; + +template +struct RegisterPRIVATEUSE1Dispatch { + RegisterPRIVATEUSE1Dispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_privateuse1_dispatch_ptr(value); + } +}; + +} // anonymous namespace +// Compiler will complain if you put things like std::tuple in +// the `fn` argument of DECLARE_DISPATCH. Some possible workarounds, e.g., +// adding parentheses and using helper struct to get rid of the parentheses, do +// not work with MSVC. So do a `using`-declaration if you need to pass in such +// `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. +#define DECLARE_DISPATCH(fn, name) \ + struct name##_DECLARE_DISPATCH_type : DispatchStub { \ + name##_DECLARE_DISPATCH_type() = default; \ + name##_DECLARE_DISPATCH_type(const name##_DECLARE_DISPATCH_type&) = delete; \ + name##_DECLARE_DISPATCH_type& operator=(const name##_DECLARE_DISPATCH_type&) = delete; \ + name##_DECLARE_DISPATCH_type(name##_DECLARE_DISPATCH_type&&) = delete; \ + name##_DECLARE_DISPATCH_type& operator=(name##_DECLARE_DISPATCH_type&&) = delete; \ + ~name##_DECLARE_DISPATCH_type() = default; \ + }; \ + extern TORCH_API struct name##_DECLARE_DISPATCH_type name; + +#define DEFINE_DISPATCH(name) struct name##_DECLARE_DISPATCH_type name + +#define REGISTER_ARCH_DISPATCH(name, arch, fn) \ + template <> name##_DECLARE_DISPATCH_type::FnPtr TORCH_API DispatchStub::arch = fn; + +#ifdef HAVE_AVX512_CPU_DEFINITION +#define REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX512, fn) +#else +#define REGISTER_AVX512_DISPATCH(name, fn) +#endif + +#ifdef HAVE_AVX2_CPU_DEFINITION +#define REGISTER_AVX2_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX2, fn) +#else +#define REGISTER_AVX2_DISPATCH(name, fn) +#endif + +#ifdef HAVE_VSX_CPU_DEFINITION +#define REGISTER_VSX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, VSX, fn) +#else +#define REGISTER_VSX_DISPATCH(name, fn) +#endif + +#ifdef HAVE_ZVECTOR_CPU_DEFINITION +#define REGISTER_ZVECTOR_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, ZVECTOR, fn) +#else +#define REGISTER_ZVECTOR_DISPATCH(name, fn) +#endif + +#ifdef HAVE_SVE256_CPU_DEFINITION +#define REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, SVE256, fn) +#else +#define REGISTER_SVE256_DISPATCH(name, fn) +#endif + +// Macro to register the same kernel for all CPU arch types. This is useful +// if a kernel does not benefit from being recompiled across different arch types. +#define REGISTER_ALL_CPU_DISPATCH(name, fn) \ + REGISTER_ARCH_DISPATCH(name, DEFAULT, fn) \ + REGISTER_AVX512_DISPATCH(name, fn) \ + REGISTER_AVX2_DISPATCH(name, fn) \ + REGISTER_VSX_DISPATCH(name, fn) \ + REGISTER_ZVECTOR_DISPATCH(name, fn) \ + REGISTER_SVE256_DISPATCH(name, fn) + +#define REGISTER_NO_CPU_DISPATCH(name) \ + REGISTER_ALL_CPU_DISPATCH(name, nullptr) + +#define REGISTER_CUDA_DISPATCH(name, fn) \ + static RegisterCUDADispatch name ## __register(name, fn); + +#define REGISTER_XPU_DISPATCH(name, fn) \ + static RegisterXPUDispatch name ## __register(name, fn); + +#define REGISTER_HPU_DISPATCH(name, fn) \ + static RegisterHPUDispatch name ## __register(name, fn); + +#define REGISTER_HIP_DISPATCH(name, fn) \ + static RegisterHIPDispatch name ## __register(name, fn); + +#define REGISTER_MPS_DISPATCH(name, fn) \ + static RegisterMPSDispatch name ## __register(name, fn); + +#define REGISTER_MTIA_DISPATCH(name, fn) \ + static RegisterMTIADispatch name ## __register(name, fn); + +#define REGISTER_PRIVATEUSE1_DISPATCH(name, fn) \ + static RegisterPRIVATEUSE1Dispatch name ## __register(name, fn); + +// NB: This macro must be used in an actual 'cu' file; if you try using +// it from a 'cpp' file it will not work! +#if defined(__CUDACC__) +#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn) +#elif defined(__HIPCC__) +// TODO: cut this over to HIP dispatch once we stop pretending that CUDA +// is HIP in the PyTorch HIPify build. +#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn) +// #define REGISTER_DISPATCH(name, fn) REGISTER_HIP_DISPATCH(name, fn) +#elif defined(__OBJC__) && defined(USE_MPS) +// NB: this macro must be used from a 'mm' file in order to dispatch a MPS kernel +#define REGISTER_DISPATCH(name, fn) REGISTER_MPS_DISPATCH(name, fn) +#elif defined(CPU_CAPABILITY) +// REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches. +// ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others. +// ALSO_REGISTER_SVE256_DISPATCH should be used for ensuring SVE256 dispatch, among others. +#ifdef CPU_CAPABILITY_AVX512 +#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, ((void*)(fn) ? nullptr : nullptr)) +#else +#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn) +#endif +#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn) +#define ALSO_REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn) +#endif +} // namespace at::native + +C10_CLANG_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Distance.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Distance.h new file mode 100644 index 0000000000000000000000000000000000000000..774434385595c7e3e3a35e1b956062f6156abb01 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Distance.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at { +class Tensor; + +namespace native { + +using pdist_forward_fn = void(*)(Tensor&, const Tensor&, const double p); +using pdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&); +using cdist_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p); +using cdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&); + +DECLARE_DISPATCH(pdist_forward_fn, pdist_forward_stub) +DECLARE_DISPATCH(pdist_backward_fn, pdist_backward_stub) +DECLARE_DISPATCH(cdist_fn, cdist_stub) +DECLARE_DISPATCH(cdist_backward_fn, cdist_backward_stub) + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/DistributionTemplates.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/DistributionTemplates.h new file mode 100644 index 0000000000000000000000000000000000000000..f97f4fb0443c400011a9af0d8ba896c8a5c1c2f5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/DistributionTemplates.h @@ -0,0 +1,399 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native::templates { + +// ==================================================== Random ======================================================== + +// The purpose of `update_from` and `update_to` is to find the closest valid int64_t number that can be used as actual `from`. +// The current implementation of `random_` uses uint64_t arithmetic and casts the result to the target dtype(scalar_t). +// This casting can result in generating numbers that happen to be greater or equal to `to` value. For instance: +// +// auto actual = torch::empty({3, 3}, torch::half); +// actual.random_(0, 65504); +// +// If random's uint64_t arithmetic produces 65503 as a random value after casting to torch::half it becomes 65504 +// and violates the requirement that random value must be less than `to`. To resolve this issue `update_from` and `update_to` +// moves `from` to the right and `to` to the left to the next closest value that won't go outside [from, to) after casting to +// the target dtype. For `to` = 65504 it moves left for (1 << (log2(to) - 11 + 1)) = 32 and becomes 65472, which is previous +// available number for torch::half dtype. +template +int64_t update_from(int64_t from) { + static_assert( + std::is_floating_point_v || + std::is_same_v || + std::is_same_v, "scalar_t must be floating-point type"); + const auto from_plus_1 = static_cast(static_cast(from + 1)); + if (from_plus_1 < from) { + int64_t from_ = std::abs(from + 1); + int n = 0; + while (from_ >>= 1) ++n; + // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult) + from = from_plus_1 + (1LL << (n - std::numeric_limits::digits + 1)); + } + return from; +} + +template +int64_t update_to(int64_t to) { + static_assert( + std::is_floating_point_v || + std::is_same_v || + std::is_same_v, "scalar_t must be floating-point type"); + const auto to_minus_1 = static_cast(static_cast(to - 1)); + if (to_minus_1 >= to) { + int64_t to_ = std::abs(to - 1); + int n = 0; + while (to_ >>= 1) ++n; + // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult) + to = to_minus_1 - (1LL << (n - std::numeric_limits::digits + 1)); + } + return to; +} + +// Return earlier for not invoking kernel. +// See https://github.com/pytorch/pytorch/issues/103418 for more details +#define CHECK_EMPTY_AND_RETURN(tensor) \ + if (tensor.numel() == 0) { \ + return tensor; \ + } + +template class random_kernel, typename RNG> +at::Tensor& random_impl(at::Tensor& self, std::optional generator) { + CHECK_EMPTY_AND_RETURN(self); + auto iter = at::TensorIterator::borrowing_nullary_op(self); + random_kernel()(iter, generator); + return self; +} + +#define CHECK_OUT_OF_BOUNDS(var, name, min, max, dtype) \ + TORCH_CHECK(var >= min && var <= max, name , " is out of bounds for ", dtype); \ + +#define WARN_OUT_OF_BOUNDS(var, name, digits, dtype) \ + if (var < -(1LL << digits) || var > (1LL << digits)) { \ + TORCH_WARN(name , " is out of bounds [-(2^", digits, "), 2^", digits, "]. ", \ + "Due to precision limitations ", dtype, " can support discrete uniform distribution only within this range. ", \ + "This warning will become an error in version 1.7 release, please fix the code in advance"); \ + } + +inline void check_from_to_in_range(int64_t from, int64_t to_inc, caffe2::TypeMeta dtype) { + const auto scalar_type = typeMetaToScalarType(dtype); + if (isFloatingType(scalar_type)) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "check_random_fp_bounds", [&] { + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype); + + constexpr auto digits = std::numeric_limits::digits; + WARN_OUT_OF_BOUNDS(from, "from", digits, dtype); + WARN_OUT_OF_BOUNDS(to_inc, "to - 1", digits, dtype); + }); + } else if (scalar_type == kUInt64) { + // When you do a comparison between int64_t and uint64_t, the usual + // arithmetic conversions say that the int64_t value is promoted to + // unsigned. But this conversion wraps around: if I had -1 as my int64_t, + // then it will promote to 0xFFFFFFFFFFFFFFFF in uint64_t. This is never + // the right thing to do. + CHECK_OUT_OF_BOUNDS(from, "from", 0, INT64_MAX, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", 0, INT64_MAX, dtype); + } else if (isIntegralType(scalar_type, /*includeBool=*/true)) { + AT_DISPATCH_V2(scalar_type, "check_random_integral_bounds", AT_WRAP([&]() { + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype); + }), AT_EXPAND(AT_INTEGRAL_TYPES), kUInt16, kUInt32, kBool); + } else { + TORCH_CHECK(false, "check_random_bounds handles only integral, floating-point and boolean types"); + } +} + +template class random_from_to_kernel, typename RNG> +at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, std::optional to_opt, std::optional generator) { + uint64_t range = 0; + auto iter = at::TensorIterator::borrowing_nullary_op(self); + if (to_opt.has_value()) { + // [from, to) + int64_t to = *to_opt; + TORCH_CHECK(from < to, "random_ expects 'from' to be less than 'to', but got from=", from, " >= to=", to); + if (isFloatingType(iter.dtype())) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_update_from_to", [&] { + from = update_from(from); + to = update_to(to); + TORCH_CHECK(from < to, "random_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", from, " >= to=", to); + }); + } + check_from_to_in_range(from, to - 1, self.dtype()); + CHECK_EMPTY_AND_RETURN(self); + range = static_cast(to) - static_cast(from); + random_from_to_kernel()(iter, range, from, generator); + } else if (from != std::numeric_limits::lowest()) { + // [from, std::numeric_limits::max()] + int64_t to_inc = 0; + if (isFloatingType(iter.dtype())) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_from_to_range_calc", [&] { + constexpr int64_t scalar_t_max = static_cast(1) << std::numeric_limits::digits; + to_inc = scalar_t_max > std::numeric_limits::max() ? std::numeric_limits::max() : static_cast(scalar_t_max); + from = update_from(from); + TORCH_CHECK(from < to_inc, "random_ expects 'from' casted to dtype to be less than or equal to 'to_inc' casted to dtype, but got from=", from, " > to_inc=", to_inc); + }); + } else if (isIntegralType(iter.dtype(), /*includeBool=*/true)) { + AT_DISPATCH_V2(self.scalar_type(), "random_from_to_range_calc", AT_WRAP([&] { + if constexpr (std::is_same_v) { + to_inc = static_cast(true); + } else { + to_inc = static_cast(std::numeric_limits::max()); + } + }), AT_EXPAND(AT_INTEGRAL_TYPES_V2), kBool); + } else { + TORCH_CHECK(false, "random_from_to_impl handles only integral, floating-point and boolean types"); + } + check_from_to_in_range(from, to_inc, self.dtype()); + CHECK_EMPTY_AND_RETURN(self); + range = static_cast(to_inc) - static_cast(from) + 1; + random_from_to_kernel()(iter, range, from, generator); + } else { + // [std::numeric_limits::lowest(), std::numeric_limits::max()] + // range = 2^64 + CHECK_EMPTY_AND_RETURN(self); + random_from_to_kernel()(iter, generator); + } + return self; +} + +// ==================================================== Normal ======================================================== + +#define CHECK_NORMAL_TENSOR_STD(std) \ + do { \ + TORCH_CHECK( \ + !std.is_complex(), \ + "normal expects standard deviation to be non-complex"); \ + TORCH_CHECK( \ + std.numel() == 0 || std.is_meta() || std.min().ge(0).item(), \ + "normal expects all elements of std >= 0.0"); \ + } while (0) + +#define CHECK_NORMAL_STD(std) \ + TORCH_CHECK(std >= 0.0, "normal expects std >= 0.0, but found std ", std); + +template class normal_kernel, typename RNG> +Tensor& normal_impl_(Tensor& self, double mean, double std, std::optional gen) { + CHECK_NORMAL_STD(std); + CHECK_EMPTY_AND_RETURN(self); + + if (self.is_complex()) { + auto float_tensor = at::view_as_real(self); + // variance for normal distribution of the real and imaginary values + // is half of the input variance + normal_kernel()(float_tensor, mean, std/(std::sqrt(2)), gen); + } else { + normal_kernel()(self, mean, std, gen); + } + return self; +} + +template class normal_kernel, typename RNG> +Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, std::optional gen) { + CHECK_NORMAL_STD(std); + auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous); + auto shape = at::infer_size(mean.sizes(), std_tensor.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, std, gen); + output.add_(mean); + return output; +} + +template class normal_kernel, typename RNG> +Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, std::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + auto mean_tensor = at::full({}, mean, output.options()); + auto shape = at::infer_size(mean_tensor.sizes(), std.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, 1, gen); + // CUDA NB: addcmul_out copies the tensor to be added into the output. + // The previous function here was addcmul_out(output, mean_tensor, output, std, 1); + // The third argument is not a constant reference and hence the samples in output are overwritten. + // Consequently, the computation performed is mean_tensor + mean_tensor * std instead of mean_tensor + output * std + output.mul_(std).add_(mean_tensor); + return output; +} + +template class normal_kernel, typename RNG> +Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, std::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + auto shape = at::infer_size(mean.sizes(), std.sizes()); + at::native::resize_output(output, shape); + normal_impl_(output, 0, 1, gen); + // CUDA NB: addcmul_out copies the tensor to be added into the output. + // The previous function here was addcmul_out(output, mean, output, std, 1); + // The third argument is not a constant reference and hence the samples in output are overwritten. + // Consequently, the computation performed is mean + mean * std instead of mean + output * std + output.mul_(std).add_(mean); + return output; +} + +template class normal_kernel, typename RNG> +Tensor normal_impl(const Tensor& mean, double std, std::optional gen) { + CHECK_NORMAL_STD(std); + Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous); + normal_out_impl(ret, mean, std, gen); + return ret; +} + +template class normal_kernel, typename RNG> +Tensor normal_impl(double mean, const Tensor& std, std::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + Tensor ret = at::empty_like(std, MemoryFormat::Contiguous); + normal_out_impl(ret, mean, std, gen); + return ret; +} + +template class normal_kernel, typename RNG> +Tensor normal_impl(const Tensor& mean, const Tensor& std, std::optional gen) { + CHECK_NORMAL_TENSOR_STD(std); + auto shape = at::infer_size(mean.sizes(), std.sizes()); + Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous); + normal_out_impl(ret, mean, std, gen); + return ret; +} + +// ==================================================== Uniform ======================================================= + +template class uniform_kernel, typename RNG> +at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, std::optional generator) { + if (self.is_complex()) { + CHECK_EMPTY_AND_RETURN(self); + auto float_tensor = at::view_as_real(self); + uniform_impl_(float_tensor, from, to, generator); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "check_uniform_bounds", [&] { + [[maybe_unused]] const auto dtype = self.dtype(); + const auto min = static_cast(std::numeric_limits::lowest()); + const auto max = static_cast(std::numeric_limits::max()); + CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype); + CHECK_OUT_OF_BOUNDS(to, "to", min, max, dtype); + TORCH_CHECK(from <= to, "uniform_ expects to return a [from, to) range, but found from=", from, " > to=", to); + TORCH_CHECK((to - from) <= std::numeric_limits::max(), + "uniform_ expects to-from <= std::numeric_limits<", toString(self.scalar_type()), + ">::max(), but found to=", to, " and from=", from, + " which result in to-from to exceed the limit"); + from = std::min(std::max(from, min), max); + to = std::max(std::min(to, max), min); + }); + CHECK_EMPTY_AND_RETURN(self); + auto iter = at::TensorIterator::borrowing_nullary_op(self); + uniform_kernel()(iter, from, to, generator); + } + return self; +} + +// ================================================== LogNormal ======================================================= + +template class log_normal_kernel, typename RNG> +at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, std::optional gen) { + TORCH_CHECK(std > 0.0, "log_normal_ expects std > 0.0, but found std=", std); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + log_normal_kernel()(iter, mean, std, gen); + return self; +} + +// =================================================== Geometric ====================================================== + +template class geometric_kernel, typename RNG> +Tensor& geometric_impl_(Tensor& self, double p, std::optional gen) { + TORCH_CHECK(0 < p && p < 1, "geometric_ expects p to be in (0, 1), but got p=", p); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + geometric_kernel()(iter, p, gen); + return self; +} + +// ================================================== Exponential ===================================================== + +template class exponential_kernel, typename RNG> +Tensor& exponential_impl_(Tensor& self, double lambda, std::optional gen) { + TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + exponential_kernel()(iter, lambda, gen); + return self; +} + +// ==================================================== Cauchy ======================================================== + +template class cauchy_kernel, typename RNG> +Tensor& cauchy_impl_(Tensor& self, double median, double sigma, std::optional gen) { + // TODO: instead of variable name 'sigma', use 'gamma' or 'scale' + // the variance, squared sigma, is undefined for cauchy distribution + TORCH_CHECK(sigma > 0.0, "cauchy_ expects sigma > 0.0, but found sigma=", sigma); + TORCH_CHECK(at::isFloatingType(self.scalar_type()), "Cauchy distribution is a continuous probability distribution. dtype must be a floating point but you specified ", self.dtype()); + CHECK_EMPTY_AND_RETURN(self); + auto iter = TensorIterator::borrowing_nullary_op(self); + cauchy_kernel()(iter, median, sigma, gen); + return self; +} + +// ==================================================== Bernoulli ===================================================== + +template class bernoulli_tensor_kernel, typename RNG> +Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, std::optional gen) { + CHECK_EMPTY_AND_RETURN(self); + NoNamesGuard guard; + at::assert_no_internal_overlap(self); + bernoulli_tensor_kernel()(self, p_, gen); + return self; +} + +template class bernoulli_scalar_kernel, typename RNG> +Tensor& bernoulli_impl_(Tensor& self, double p, std::optional gen) { + TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p); + CHECK_EMPTY_AND_RETURN(self); + at::assert_no_internal_overlap(self); + bernoulli_scalar_kernel()(self, p, gen); + return self; +} + +template class bernoulli_tensor_kernel, typename RNG> +Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, std::optional gen) { + // result.resize_as_(self) requires self to have same dtype as result, so we + // use resize_ instead. + // TODO: Fix resize_as_. See pytorch/pytorch#11665. + result.resize_(self.sizes()); + bernoulli_impl_(result, self, gen); + namedinference::propagate_names(result, self); + return result; +} + +#undef CHECK_OUT_OF_BOUNDS +#undef WARN_OUT_OF_BOUNDS + +} // namespace at::native::templates + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Distributions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Distributions.h new file mode 100644 index 0000000000000000000000000000000000000000..81dfbd07129a8efa07986e99e68665ef67c18961 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Distributions.h @@ -0,0 +1,524 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +// ROCM hcc doesn't work well with using std:: in kernel functions +#if defined(__CUDA_ARCH__) +#include +#define compat_exp c10::cuda::compat::exp +#define compat_ceil c10::cuda::compat::ceil +#define compat_floor c10::cuda::compat::floor +#define compat_log c10::cuda::compat::log +#define compat_pow c10::cuda::compat::pow +#define compat_sqrt c10::cuda::compat::sqrt +#define compat_tan c10::cuda::compat::tan +#define compat_abs c10::cuda::compat::abs +#define compat_log1p c10::cuda::compat::log1p +#elif defined(__HIPCC__) +#include +#define compat_exp c10::hip::compat::exp +#define compat_ceil c10::hip::compat::ceil +#define compat_floor c10::hip::compat::floor +#define compat_log c10::hip::compat::log +#define compat_pow c10::hip::compat::pow +#define compat_sqrt c10::hip::compat::sqrt +#define compat_tan c10::hip::compat::tan +#define compat_abs c10::hip::compat::abs +#define compat_log1p c10::hip::compat::log1p +#else +#define compat_exp std::exp +#define compat_ceil std::ceil +#define compat_floor std::floor +#define compat_log std::log +#define compat_pow std::pow +#define compat_sqrt std::sqrt +#define compat_tan std::tan +#define compat_abs std::abs +#define compat_log1p std::log1p +#endif + +namespace { + +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) +// we cannot use std::isnan directly due to some incompatibility of +// gcc constexpr'ing and nvcc +using std::isnan; +#endif + +// Here sampler_t should be function type scalar_t(void). For gpu +// "sampler" is a device function, but since ROCM doesn't have +// equivalent to nvstd::function, we use a template type parameter to +// capture it. +template +struct BaseSampler { + sampler_t sampler; + C10_DEVICE BaseSampler(const sampler_t& sampler): sampler(sampler) {} + C10_DEVICE scalar_t sample() { + return sampler(); + } +}; + +// The function `sample_gamma` is +// is adapted from Numpy's distributions.c implementation. +// It is MIT licensed, so here is the copyright: + +/* Copyright 2005 Robert Kern (robert.kern@gmail.com) + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +template +C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler& standard_uniform, BaseSampler& standard_normal) { + accscalar_t scale = 1.0f; + + // Boost alpha for higher acceptance probability. + if (alpha < 1.0f) { + if (alpha == 0.f) return 0.f; + scale *= compat_pow(1 - standard_uniform.sample(), 1.0f / alpha); + alpha += 1.0f; + } + + // This implements the acceptance-rejection method of Marsaglia and Tsang (2000) + // doi:10.1145/358407.358414 + const accscalar_t d = alpha - 1.0f / 3.0f; + const accscalar_t c = 1.0f / compat_sqrt(9.0f * d); + for (;;) { + accscalar_t x, y; + do { + x = standard_normal.sample(); + y = 1.0f + c * x; + } while (y <= 0); + const accscalar_t v = y * y * y; + const accscalar_t u = 1 - standard_uniform.sample(); + const accscalar_t xx = x * x; + if (u < 1.0f - 0.0331f * xx * xx) + return static_cast(scale * d * v); + if (compat_log(u) < 0.5f * xx + d * (1.0f - v + compat_log(v))) + return static_cast(scale * d * v); + } +} + +/* the functions stirling_approx_tail, binomial_inversion, and btrs are adapted + * from TensorFlow's random_binomial_op.cc implementation. That code is under + * copyright: 2019 The TensorFlow Authors. + * + * It was released under the Apache License, Version 2.0 (the "License"), available at: + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +template +C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) { + constexpr static scalar_t kTailValues[] = { + 0.0810614667953272, + 0.0413406959554092, + 0.0276779256849983, + 0.02079067210376509, + 0.0166446911898211, + 0.0138761288230707, + 0.0118967099458917, + 0.0104112652619720, + 0.00925546218271273, + 0.00833056343336287 + }; + if (k < std::size(kTailValues)) { + return kTailValues[static_cast(k)]; + } + scalar_t kp1sq = (k + 1) * (k + 1); + return (1.0 / 12 - (1.0 / 360 - 1.0 / 1260 / kp1sq) / kp1sq) / (k + 1); +} + + +template +C10_DEVICE scalar_t binomial_inversion(scalar_t count, scalar_t prob, BaseSampler& standard_uniform) { + accscalar_t U; + accscalar_t geom_sum = 0; + scalar_t num_geom = 0; + + accscalar_t logprob = compat_log1p(-prob); + + while (true) { + U = standard_uniform.sample(); + accscalar_t geom = compat_ceil(compat_log(U) / logprob); + geom_sum += geom; + if (geom_sum > count) { + break; + } + num_geom = num_geom + 1; + } + return num_geom; +} + +template +C10_DEVICE scalar_t btrs(scalar_t count, scalar_t prob, BaseSampler& standard_uniform) { + scalar_t k; + accscalar_t U, V, us; + + // This is spq in the paper. + const accscalar_t stddev = compat_sqrt(count * prob * (1 - prob)); + + // Other coefficients for Transformed Rejection sampling. + const accscalar_t b = 1.15 + 2.53 * stddev; + const accscalar_t a = -0.0873 + 0.0248 * b + 0.01 * prob; + const accscalar_t c = count * prob + 0.5; + const accscalar_t v_r = 0.92 - 4.2 / b; + const accscalar_t r = prob / (1 - prob); + + const accscalar_t alpha = (2.83 + 5.1 / b) * stddev; + const accscalar_t m = compat_floor((count + 1) * prob); + + while (true) { + U = standard_uniform.sample() - 0.5; + V = standard_uniform.sample(); + + us = 0.5 - compat_abs(U); + k = static_cast(compat_floor((2 * a / us + b) * U + c)); + + // Reject non-sensical answers. + if (k < 0 || k > count) { + continue; + } + // Region for which the box is tight, and we can return our calculated value. + // This should happen 0.86 * v_r times. In the limit as n * p is large, + // the acceptance rate converges to ~79% (and in the lower regime it is ~24%). + if (us >= 0.07 && V <= v_r) { + return k; + } + + // This deviates from Hormann's BTRS algorithm, as there is a log missing. + // For all (u, v) pairs outside of the bounding box, this calculates the + // transformed-reject ratio. + V = compat_log(V * alpha / (a / (us * us) + b)); + accscalar_t upperbound = + ((m + 0.5) * compat_log((m + 1) / (r * (count - m + 1))) + + (count + 1) * compat_log((count - m + 1) / (count - k + 1)) + + (k + 0.5) * compat_log(r * (count - k + 1) / (k + 1)) + + stirling_approx_tail(m) + stirling_approx_tail(count - m) - + stirling_approx_tail(k) - stirling_approx_tail(count - k)); + + if (V <= upperbound) { + return k; + } + } +} + +template +C10_DEVICE scalar_t sample_binomial(scalar_t count, scalar_t prob, BaseSampler& standard_uniform) { + if (count <= 0.0 || prob <= 0.0) { + return 0; + } else if (prob >= 1.0) { + return count; + } else if (prob <= 0.5) { + if (count * prob >= 10.0) { + // btrs + return btrs(count, prob, standard_uniform); + } else { + // binomial inversion + return binomial_inversion(count, prob, standard_uniform); + } + } else if (prob > 0.5) { + scalar_t qprob = 1.0 - prob; + if (count * qprob >= 10.0) { + // btrs + return count - btrs(count, qprob, standard_uniform); + } else { + // count - binomial inversion + return count - binomial_inversion(count, qprob, standard_uniform); + } + } else { + // prob is nan? + return static_cast(NAN); + } +} + +/* + * This function is derived from the implementation of the digamma function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library] in ATen/native/Math.h. + */ +template +C10_DEVICE inline scalar_t digamma_one(scalar_t x) { + constexpr accscalar_t PSI_10 = 2.25175258906672110764; + if (x == 0) { + return INFINITY; + } + accscalar_t additional_summand = 0; + int x_is_integer = x == compat_floor(x); + if (x < 0) { + if (x_is_integer) { + return INFINITY; + } + // it is more standard to write this as recursion, but + // nvcc does not like that + additional_summand = -c10::pi / + compat_tan(c10::pi * x); + x = 1 - x; + } + + // Push x to be >= 10 + accscalar_t result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return result + PSI_10 + additional_summand; + } + + // Compute asymptotic digamma + static const accscalar_t A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + accscalar_t y = 0; + if (x < 1.0e17f) { + accscalar_t z = 1.0 / (x * x); + y = z * polevl(z, A, 6); + } + return static_cast( + result + compat_log(x) - (0.5f / x) - y + additional_summand); +} + +// Computes the reparameterized gradient -(d/dalpha cdf(x;alpha)) / pdf(x;alpha) +// for random number x drawn from a standard Gamma distribution Gamma(alpha). +template +C10_HOST_DEVICE scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) { + // Use a Taylor series expansion for small x. + accscalar_t x = static_cast(x_); + accscalar_t alpha = static_cast(alpha_); + if (x < 0.8f) { + accscalar_t numer = 1; + accscalar_t denom = alpha; + auto series1 = numer / denom; + auto series2 = numer / (denom * denom); + for (int i = 1; i <= 5; ++i) { + numer *= -x / static_cast(i); + denom += 1; + series1 += numer / denom; + series2 += numer / (denom * denom); + } + const auto pow_x_alpha = compat_pow(x, alpha); + const auto gamma_pdf = compat_pow(x, alpha - 1) * compat_exp(-x); + const auto gamma_cdf = pow_x_alpha * series1; + const auto gamma_cdf_alpha = + (compat_log(x) - digamma_one(alpha)) * + gamma_cdf - + pow_x_alpha * series2; + const auto result = -gamma_cdf_alpha / gamma_pdf; + return isnan(result) ? static_cast( 0.f ) : static_cast(result); + } + + // Use a Rice saddle point expansion for large alpha. + if (alpha > 8.0f) { + if (0.9f * alpha <= x && x <= 1.1f * alpha) { + const auto numer_1 = 1 + 24 * alpha * (1 + 12 * alpha); + const auto numer_2 = 1440 * (alpha * alpha) + 6 * x * (53 - 120 * x) + - 65 * x * x / alpha + alpha * (107 + 3600 * x); + const auto denom = 1244160 * (alpha * alpha) * (alpha * alpha); + return static_cast(numer_1 * numer_2 / denom); + } + const auto denom = compat_sqrt(8 * alpha); + const auto term2 = denom / (alpha - x); + const auto term3 = compat_pow( + x - alpha - alpha * compat_log(x / alpha), + static_cast(-1.5)); + const auto term23 = (x < alpha) ? term2 - term3 : term2 + term3; + const auto term1 = compat_log(x / alpha) * term23 - + compat_sqrt(2 / alpha) * (alpha + x) / ((alpha - x) * (alpha - x)); + const auto stirling = 1 + 1 / (12 * alpha) * (1 + 1 / (24 * alpha)); + const auto numer = x * term1; + return static_cast(-stirling * numer / denom); + } + + // Use a bivariate rational approximation to the reparameterized gradient. + const auto u = compat_log(x / alpha); + const auto v = compat_log(alpha); + static const accscalar_t coef_uv[3][8] = { + {0.16009398, -0.094634809, 0.025146376, -0.0030648343, + 1, 0.32668115, 0.10406089, 0.0014179084}, + {0.53487893, 0.1298071, 0.065735949, -0.0015649758, + 0.16639465, 0.020070113, -0.0035938915, -0.00058392623}, + {0.040121004, -0.0065914022, -0.0026286047, -0.0013441777, + 0.017050642, -0.0021309326, 0.00085092367, -1.5247877e-07}, + }; + accscalar_t coef_v[8]; + for (int i = 0; i < 8; ++ i) { + coef_v[i] = coef_uv[0][i] + u * (coef_uv[1][i] + u * coef_uv[2][i]); + } + const auto p = coef_v[0] + v * (coef_v[1] + v * (coef_v[2] + v * coef_v[3])); + const auto q = coef_v[4] + v * (coef_v[5] + v * (coef_v[6] + v * coef_v[7])); + return static_cast(compat_exp(p / q)); +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha. +// Assumes x is close to zero and uses a Taylor expansion. +template +C10_DEVICE inline scalar_t _beta_grad_alpha_small(scalar_t x, scalar_t alpha, scalar_t beta) { + const scalar_t factor = digamma_one(alpha) + - digamma_one(alpha + beta) - compat_log(x); + scalar_t numer = 1; + scalar_t series = numer / alpha * (factor + 1 / alpha); + for (int i = 1; i <= 10; ++i) { + scalar_t casted_i = static_cast(i); + numer *= (casted_i - beta) * x / casted_i; + const scalar_t denom = alpha + casted_i; + series += numer / denom * (factor + 1 / denom); + } + const scalar_t result = x * compat_pow(1 - x, -beta) * series; + return isnan(result) ? static_cast( 0.f ) : result; +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt beta. +// Assumes x is close to zero and uses a Taylor expansion. +template +C10_DEVICE inline scalar_t _beta_grad_beta_small(scalar_t x, scalar_t alpha, scalar_t beta) { + const scalar_t factor = digamma_one(alpha + beta) - digamma_one(beta); + scalar_t numer = 1, betas = 1, dbetas = 0, series = factor / alpha; + for (int i = 1; i <= 8; ++i) { + scalar_t casted_i = static_cast(i); + numer *= -x / casted_i; + dbetas = dbetas * (beta - casted_i) + betas; + betas = betas * (beta - casted_i); + series += numer / (alpha + casted_i) * (dbetas + factor * betas); + } + const scalar_t result = -compat_pow(1 - x, 1 - beta) * series; + return isnan(result) ? static_cast( 0.f ) : result; +} + +// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha. +// Assumes alpha and beta are both large and uses a Rice saddle point expansion. +// To ensure numerical stability, this computation is performed at higher precision. +template +C10_DEVICE inline scalar_t _beta_grad_alpha_mid(accscalar_t x, accscalar_t alpha, accscalar_t beta) { + const accscalar_t total = alpha + beta; + const accscalar_t mean = alpha / total; + const accscalar_t std = compat_sqrt(alpha * beta / (total + 1)) / total; + if (mean - 0.1 * std <= x && x <= mean + 0.1 * std) { + // Avoid the singularity at x = mean. + const accscalar_t poly = 47 * x * (beta * beta) * (beta * beta) + alpha * ( + (43 + 20 * (16 + 27 * beta) * x) * (beta * beta) * beta + alpha * ( + 3 * (59 + 180 * beta - 90 * x) * (beta * beta) + alpha * ( + (453 + 1620 * beta * (1 - x) - 455 * x) * beta + alpha * ( + 8 * (1 - x) * (135 * beta - 11))))); + const accscalar_t prefactor_num = (1 + 12 * alpha) * (1 + 12 * beta) / (total * total); + const accscalar_t prefactor_den = 12960 * alpha * alpha * alpha * beta * beta * (1 + 12 * total); + return prefactor_num / (1 - x) * poly / prefactor_den; + } + const accscalar_t prefactor = -x / compat_sqrt(2 * alpha * beta / total); + const accscalar_t stirling = (1 + 1 / (12 * alpha) + 1 / (288 * alpha * alpha)) + * (1 + 1 / (12 * beta) + 1 / (288 * beta * beta)) + / (1 + 1 / (12 * total) + 1 / (288 * total * total)); + const accscalar_t term1_num = 2 * (alpha * alpha) * (x - 1) + alpha * beta * (x - 1) - x * (beta * beta); + const accscalar_t axbx = alpha * (x - 1) + beta * x; + const accscalar_t term1_den = compat_sqrt(2 * alpha / beta) * compat_pow(total, static_cast(1.5f)) * axbx * axbx; + const accscalar_t term1 = term1_num / term1_den; + const accscalar_t term2 = 0.5f * compat_log(alpha / (total * x)); + const accscalar_t term3_num = compat_sqrt(8 * alpha * beta / total); + const accscalar_t term3_den = beta * x + alpha * (x - 1); + const accscalar_t term3 = term3_num / term3_den; + const accscalar_t term4_base = beta * compat_log(beta / (total * (1 - x))) + + alpha * compat_log(alpha / (total * x)); + const accscalar_t term4 = compat_pow(term4_base, static_cast(-1.5f)); + const accscalar_t term1234 = term1 + term2 * (term3 + (x < mean ? term4 : -term4)); + return static_cast(stirling * prefactor * term1234); +} + +// Computes a scaled reparameterized gradient +// -(d/dalpha cdf(x;alpha,beta)) / pdf(x;alpha,beta) / (1-x) +// for random number x drawn from a Beta distribution Beta(alpha,beta). +// This function inputs total=alpha+beta to make it easy to implement +// Dirichlet reparameterized gradients in terms of Betas. +template +C10_HOST_DEVICE inline scalar_t dirichlet_grad_one(scalar_t x, scalar_t alpha, scalar_t total) { + accscalar_t x_ = static_cast(x); + accscalar_t alpha_ = static_cast(alpha); + accscalar_t total_ = static_cast(total); + + const scalar_t beta = total - alpha; + const accscalar_t beta_ = total_ - alpha_; + const scalar_t boundary = total * x * (1 - x); + + // Use an asymptotic approximation for x close to 0. + if (x <= 0.5f && boundary < 2.5f) { + return _beta_grad_alpha_small(x, alpha, beta); + } + + // Use an asymptotic approximation for x close to 1. + if (x >= 0.5f && boundary < 0.75f) { + return -_beta_grad_beta_small(1 - x, beta, alpha); + } + + // Use an asymptotic approximation when alpha and (total - alpha) are both large. + if (alpha > 6 && beta > 6) { + return _beta_grad_alpha_mid(x_, alpha_, beta_); + } + + // Use a rational correction to an analytic approximation. + static const accscalar_t c[2][3][3][4] = { + {{{1.003668233, -0.01061107488, -0.0657888334, 0.01201642863}, + {0.6336835991, -0.3557432599, 0.05486251648, -0.001465281033}, + {-0.03276231906, 0.004474107445, 0.002429354597, -0.0001557569013}}, + {{0.221950385, -0.3187676331, 0.01799915743, 0.01074823814}, + {-0.2951249643, 0.06219954479, 0.01535556598, 0.001550077057}, + {0.02155310298, 0.004170831599, 0.001292462449, 6.976601077e-05}}, + {{-0.05980841433, 0.008441916499, 0.01085618172, 0.002319392565}, + {0.02911413504, 0.01400243777, -0.002721828457, 0.000751041181}, + {0.005900514878, -0.001936558688, -9.495446725e-06, 5.385558597e-05}}}, + {{{1, -0.02924021934, -0.04438342661, 0.007285809825}, + {0.6357567472, -0.3473456711, 0.05454656494, -0.002407477521}, + {-0.03301322327, 0.004845219414, 0.00231480583, -0.0002307248149}}, + {{0.5925320577, -0.1757678135, 0.01505928619, 0.000564515273}, + {0.1014815858, -0.06589186703, 0.01272886114, -0.0007316646956}, + {-0.007258481865, 0.001096195486, 0.0003934994223, -4.12701925e-05}}, + {{0.06469649321, -0.0236701437, 0.002902096474, -5.896963079e-05}, + {0.001925008108, -0.002869809258, 0.0008000589141, -6.063713228e-05}, + {-0.0003477407336, 6.959756487e-05, 1.097287507e-05, -1.650964693e-06}}}, + }; + const accscalar_t u = compat_log(x_); + const accscalar_t a = compat_log(alpha_) - u; + const accscalar_t b = compat_log(total_) - a; + const accscalar_t pow_u[3] = {1, u, u * u}; + const accscalar_t pow_a[3] = {1, a, a * a}; + accscalar_t p = 0.0; + accscalar_t q = 0.0; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + const accscalar_t ua = pow_u[i] * pow_a[j]; + p += ua * (c[0][i][j][0] + b * (c[0][i][j][1] + b * (c[0][i][j][2] + b * c[0][i][j][3]))); + q += ua * (c[1][i][j][0] + b * (c[1][i][j][1] + b * (c[1][i][j][2] + b * c[1][i][j][3]))); + } + } + const accscalar_t approx = x_ * (digamma_one(total_) - digamma_one(alpha_)) / beta_; + return static_cast(p / q * approx); +} + +} // namespace + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/EmbeddingBag.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/EmbeddingBag.h new file mode 100644 index 0000000000000000000000000000000000000000..9d7569f13a7b62ed86f2d055158712258edc98fe --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/EmbeddingBag.h @@ -0,0 +1,159 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + +#ifdef USE_FBGEMM +#include +#endif + +namespace at::native { + +enum class EmbeddingBagMode { + SUM = 0, + MEAN = 1, + MAX = 2, +}; + +[[maybe_unused]] static bool operator==(int64_t op1, EmbeddingBagMode op2) { + return op1 == static_cast(op2); +} + +[[maybe_unused]] static bool operator!=(int64_t op1, EmbeddingBagMode op2) { + return !(op1 == op2); +} + +void check_arguments( + const Tensor& weight, + const Tensor& indices, + const Tensor& offsets, + const int64_t mode, + const std::optional& per_sample_weights, + bool include_last_offset); + +void make_bag_size_out( + Tensor& bag_size_out, + const Tensor& offsets, + const Tensor& indices, + const int64_t mode, + const bool include_last_offset, + const bool requires_grad); + +void make_max_indices_out( + Tensor& max_indices_out, + const Tensor& weight, + const Tensor& indices, + const Tensor& offsets, + const Tensor& bag_size, + const int64_t mode, + bool include_last_offset); + +void make_offset2bag_out( + Tensor& offset2bag, + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const Tensor& offsets, + const int64_t mode, + const std::optional& per_sample_weights, + const int64_t padding_idx = -1); + +#ifdef USE_FBGEMM + +template +struct _CallbackAndBlockSize { + using TCallback = typename fbgemm::EmbeddingSpMDMKernelSignature::Type; + + int64_t blockSize = -1; + TCallback callback = nullptr; + + static TCallback generateCallback(int64_t block_size) { + return fbgemm::GenerateEmbeddingSpMDM( + block_size, + has_weight, + /* normalize_by_lengths */false, + /* prefetch */16, + /* is_weight_positional */false, + /* use_offsets */true); + } + + _CallbackAndBlockSize() = default; + + explicit _CallbackAndBlockSize(std::optional maybe_block_size) + : blockSize(maybe_block_size.value_or(-1)) + , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr) + {} +}; + +template +struct _EmbeddingBagKernelCacheImpl : private StorageMixins... { + + _EmbeddingBagKernelCacheImpl() = default; + // use each of the mixins to store corresponding kernel and block size + explicit _EmbeddingBagKernelCacheImpl(std::optional maybe_block_size) + : StorageMixins(maybe_block_size)... + {} + + // this method is thread safe (call sites may call from different threads) + template + typename _CallbackAndBlockSize::TCallback + getCallback(int64_t block_size) const { + // if the cache doesn't store the kernel for the incoming block size + // (so it is different from the one stored in corresponding mixin) + // regenerate the kernel (not writing it into the cache so we avoid locks) + if (block_size != _CallbackAndBlockSize::blockSize) { + return _CallbackAndBlockSize::generateCallback(block_size); + } + // else retrieve the cached kernel from the corresponding mixin + return _CallbackAndBlockSize::callback; + } +}; + +// instantiate the cache with the list of storage mixins +// for each of the 8 _EmbeddingBagKernelCache* usages in the EmbeddingBag.cpp impl file +using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl< + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize>; +#else +struct _EmbeddingBagKernelCache { + explicit _EmbeddingBagKernelCache(std::optional /* maybe_block_size */) {} +}; +#endif + +void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, + Tensor& bag_size, Tensor* max_indices, + const Tensor &weight, const Tensor &indices, + const Tensor &offsets, const int64_t mode = 0, + const std::optional& per_sample_weights = std::nullopt, + bool include_last_offset = false, + int64_t padding_idx = -1, + _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); + +void _embedding_bag_cpu_out( + at::Tensor& output, + at::Tensor& offset2bag, + at::Tensor& bag_size, + at::Tensor* p_max_indices, + const at::Tensor& weight, + const at::Tensor& indices, + const at::Tensor& offsets, + const bool scale_grad_by_freq, + const int64_t mode, + const bool sparse, + const std::optional& per_sample_weights, + const bool include_last_offset, + const std::optional& padding_idx, + _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Fill.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Fill.h new file mode 100644 index 0000000000000000000000000000000000000000..e4809d59a3e560a6fa4d4f0613bf6bbfffdf7c86 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Fill.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Functions that fill Tensors with constants. Implementations are in Fill.cpp. + +#pragma once + +#include + +namespace c10 { +class Scalar; +} + +namespace at { +class Tensor; +struct TensorIterator; + +namespace native { + +DECLARE_DISPATCH(void(*)(TensorIterator&, const c10::Scalar&), fill_stub) + +Tensor& fill_out(Tensor& self, const Scalar& value); + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ForeachUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ForeachUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..eefc95259f662496b08fc0109d0d6c5e007521f6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ForeachUtils.h @@ -0,0 +1,385 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include +#include + +namespace at::native { +namespace { +// Check if tensor list has either a boolean tensor or a integer tensor +inline bool has_integral_tensor(TensorList tensors, const bool includeBool) { + return std::any_of( + tensors.begin(), tensors.end(), [includeBool](const auto& t) { + return at::isIntegralType(t.scalar_type(), includeBool); + }); +} +// check if tensor list has bool tensors +inline bool has_bool_tensor(TensorList tensors) { + return std::any_of(tensors.begin(), tensors.end(), [](const auto& t) -> bool { + return t.scalar_type() == ScalarType::Bool; + }); +} + +// Check foreach API restrictions +// - Tensor lists must be non-empty. +// - All TensorLists and ScalarLists must have the same number of elements. +// - Corresponding tensors must have the same size. +inline void check_foreach_api_restrictions(TensorList tensors) { + TORCH_CHECK(!tensors.empty(), "Tensor list must have at least one tensor."); +} + +inline void check_foreach_api_restrictions( + TensorList tensors, + ArrayRef scalars) { + check_foreach_api_restrictions(tensors); + TORCH_CHECK( + tensors.size() == scalars.size(), + "Tensor list must have same number of elements as scalar list."); +} + +inline void check_foreach_api_restrictions( + TensorList tensors1, + TensorList tensors2) { + check_foreach_api_restrictions(tensors1); + check_foreach_api_restrictions(tensors2); + TORCH_CHECK( + tensors1.size() == tensors2.size(), + "Tensor lists must have the same number of tensors, got ", + tensors1.size(), + " and ", + tensors2.size()); +} + +inline void check_foreach_api_restrictions( + TensorList tensors1, + TensorList tensors2, + TensorList tensors3) { + check_foreach_api_restrictions(tensors1, tensors2); + check_foreach_api_restrictions(tensors1, tensors3); +} + +inline void check_foreach_api_restrictions( + TensorList tensors1, + TensorList tensors2, + TensorList tensors3, + ArrayRef scalars) { + check_foreach_api_restrictions(tensors1, tensors2, tensors3); + check_foreach_api_restrictions(tensors1, scalars); +} + +inline void check_foreach_api_restrictions( + TensorList tensors1, + TensorList tensors2, + ArrayRef scalars) { + check_foreach_api_restrictions(tensors1, tensors2); + check_foreach_api_restrictions(tensors1, scalars); +} + +// Helper function called in check_fast_path_restrictions to check whether all +// corresponding tensors (aligning in index across the tensorLists) share the +// same device and dtype. +inline bool _check_tensors_share_device_and_dtype( + ArrayRef tensorLists, + const bool skip_dtype_check = false) { + const auto expected_dtype = tensorLists[0][0].dtype(); + const auto expected_device = tensorLists[0][0].device(); + + auto is_tensor_okay = [&](const Tensor& tensor) { + return (skip_dtype_check || tensor.dtype() == expected_dtype) && + tensor.device() == expected_device && tensor.layout() == at::kStrided && + tensor.is_non_overlapping_and_dense(); + }; + + return std::all_of( + tensorLists.cbegin(), + tensorLists.cend(), + [&](const TensorList& tensorList) { + return std::all_of( + tensorList.cbegin(), tensorList.cend(), is_tensor_okay); + }); +} + +// Helper function called in check_fast_path_restrictions to check if +// corresponding tensors in tensor lists have the same sizes and strides. +inline bool _check_tensors_share_sizes_and_strides( + ArrayRef tensorLists) { + auto is_diff_stride = [](const IntArrayRef& size, + const IntArrayRef& left_stride, + const IntArrayRef& right_stride) -> bool { + const size_t size_size = size.size(); + for (const auto dim : c10::irange(size_size)) { + if (size[dim] == 1) + continue; + if (left_stride[dim] != right_stride[dim]) { + return true; + } + } + return false; + }; + for (const auto i : c10::irange(1, tensorLists.size())) { + for (const auto j : c10::irange(tensorLists[0].size())) { + if (tensorLists[0][j].sizes() != tensorLists[i][j].sizes() || + is_diff_stride( + tensorLists[0][j].sizes(), + tensorLists[0][j].strides(), + tensorLists[i][j].strides())) { + return false; + } + } + } + + return true; +} + +// Helper function called in check_fast_path_restrictions to check whether +// all tensors type promote properly with the scalars in scalarList. This +// function assumes that _check_tensors_share_device_and_dtype has already been +// called so that all corresponding tensors in tensorLists have the same dtype. +// Then, it is sufficient to check the type promotion with just one tensorList. +inline bool _check_tensors_do_type_promotion_with_scalars( + TensorList tensorList, + ArrayRef scalarList = {}, + bool does_op_promote_integer_inputs_to_float = false) { + for (const auto i : c10::irange(tensorList.size())) { + // For division, integer inputs will result in float. + if (does_op_promote_integer_inputs_to_float && + at::isIntegralType(tensorList[i].scalar_type(), /*includeBool*/ true)) { + return false; + } + if (!scalarList.empty()) { + const auto& scalar = + scalarList.size() == 1 ? scalarList[0] : scalarList[i]; + const auto& tensor = tensorList[i]; + // note(mkozuki): This check might be responsible for + // `_foreach_add(bool_tensors, bool_tensors)` being pushed to slow path. + if (tensor.scalar_type() != at::native::result_type(scalar, tensor)) { + return false; + } + } + } + + return true; +} + +// To go via 'fast' path, several conditions must be satisfied +// - All tensors in all lists must have the same dtype. +// - All tensors must be on the same device +// - All tensors must have strided layout +// - All tensors must be non-overlapping and dense +// - Resulting tensor must have the same dtype as the input one + +// [note: what's ``does_op_promote_integer_inputs_to_float=true``?] +// ``does_op_promote_integer_inputs_to_float=true`` means that the result of +// the op will be float even if inputs are integer or boolean, which +// currently fast path does not support. In short, this flag, when +// turned on, gatekeeps the op from going down the fastpath. + +// Please, make sure to call check_foreach_api_restrictions before calling this +// method. There is a set of preconditions that have to be satisfied. +inline bool check_fast_path_restrictions( + ArrayRef tensorLists, + ArrayRef scalarList = {}, + bool does_op_promote_integer_inputs_to_float = false) { + return _check_tensors_share_device_and_dtype(tensorLists) && + _check_tensors_share_sizes_and_strides(tensorLists) && + _check_tensors_do_type_promotion_with_scalars( + tensorLists[0], + scalarList, + does_op_promote_integer_inputs_to_float); +} + +inline std::vector convert_tensor_to_scalar_list( + const Tensor& scalarList_, + int64_t expect_length) { + std::vector scalarList; + TORCH_CHECK( + scalarList_.device() == c10::kCPU, + "Expected scalars to be on CPU, got ", + scalarList_.device(), + " instead."); + TORCH_CHECK( + scalarList_.is_contiguous(), "Expected scalars to be contiguous."); + TORCH_CHECK( + scalarList_.dim() == 1, + "Expected packed scalar Tensor to be of dimension 1. Got ", + scalarList_.dim(), + " instead."); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( + kComplexHalf, + kHalf, + kBool, + kBFloat16, + scalarList_.scalar_type(), + "convert_tensor_to_scalar_list", + [&]() { + const scalar_t* scalar_data = scalarList_.const_data_ptr(); + TORCH_CHECK( + (expect_length == scalarList_.size(0)), + "Expected length of scalars to match input of length ", + expect_length, + " but got ", + scalarList_.size(0), + " instead."); + for (int64_t i = 0; i < scalarList_.size(0); i++) { + scalarList.emplace_back(scalar_data[i]); + } + }); + return scalarList; +} + +// see: [note: what's ``does_op_promote_integer_inputs_to_float=true``?] +inline bool can_use_fast_route( + ArrayRef tensorLists, + ArrayRef scalarList = {}, + bool does_op_promote_integer_inputs_to_float = false) { + return check_fast_path_restrictions( + tensorLists, scalarList, does_op_promote_integer_inputs_to_float); +} + +// see: [note: what's ``does_op_promote_integer_inputs_to_float=true``?] +inline bool can_use_fast_route( + TensorList tensors1, + TensorList tensors2, + bool does_op_promote_integer_inputs_to_float = false) { + return can_use_fast_route( + {tensors1, tensors2}, {}, does_op_promote_integer_inputs_to_float); +} + +using DeviceDtypeKey = std::pair; +using IndicesT = std::vector; +using nested_optional_tensorvec_t = + std::vector>>; +using TensorsAndIndicesT = std::pair; +using FlatMap = std::unordered_map< + DeviceDtypeKey, + TensorsAndIndicesT, + ParamsHash>; + +inline FlatMap _group_tensors_by_first_tensors_device_and_dtype( + const nested_optional_tensorvec_t& nested_tensorlist, + const bool with_indices) { + FlatMap grouped_tensors_with_indices; + + TORCH_CHECK(!nested_tensorlist.empty()); + TORCH_CHECK(!nested_tensorlist[0].empty()); + const auto num_lists = nested_tensorlist.size(); + const auto num_tensors = nested_tensorlist[0].size(); + + TORCH_CHECK(std::all_of( + nested_tensorlist.cbegin(), + nested_tensorlist.cend(), + [&](const auto& tensorlist) -> bool { + // note(crcrpar): Allow empty tensorlists following + // ref: + // https://github.com/pytorch/pytorch/blob/85885301fd3c6adb8b9dc3cf7afadf6945566684/torch/utils/_foreach_utils.py#L21-L24 + return tensorlist.size() == num_tensors || tensorlist.size() == 0; + })); + + for (const auto& tensor_index : c10::irange(num_tensors)) { + const auto key = [&]() -> DeviceDtypeKey { + const auto t = nested_tensorlist[0][tensor_index]; + TORCH_CHECK( + t.has_value(), + "Tensors of the first list of nested Tensor lists are supposed to be defined but ", + "the ", + tensor_index, + "-th Tensor is not."); + return {t->device(), t->scalar_type()}; + }(); + TORCH_CHECK( + std::all_of( + nested_tensorlist.cbegin(), + nested_tensorlist.cend(), + [&](const auto& tensorlist) -> bool { + if (tensorlist.size() == 0) { + return true; + } + const auto& tensor = tensorlist[tensor_index]; + // note(crcrpar): Currently the scope of this function is + // optimizers so there could be `state_steps` and other scalars + // whose elements are float tensors no matter what the parameter's + // dtype is. + if (!tensor.has_value()) { + return true; + } else { + const auto s = tensor->scalar_type(); + const auto d = tensor->device(); + // Note: `step` or `state_step` is float32 by default. + if (key.first == d) { + return key.second == s || s == at::ScalarType::Float || + s == at::ScalarType::Double; + } else if (d.is_cpu()) { + // note(crcrpar): There are some test cases (e.g. + // TestOptim::test_adam) where state_steps are on CPU and the + // others are on CUDA. Currently a state_step Tensor has the + // dtype of float. + return s == at::ScalarType::Float || + s == at::ScalarType::Double; + } else { + return false; + } + } + }), + "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding"); + grouped_tensors_with_indices.try_emplace( + key, + TensorsAndIndicesT{ + [&]() -> nested_optional_tensorvec_t { + nested_optional_tensorvec_t nested_tensorvec; + nested_tensorvec.reserve(num_lists); + for (const auto& i : c10::irange(num_lists)) { + std::vector> tensors; + if (!nested_tensorlist[i].empty()) { + // NB: num_tensors is the max possible length for any of + // the inner lists of tensor references. Reserving the max + // trades memory for perf. This should not have significant + // impact. + tensors.reserve(num_tensors); + } + nested_tensorvec.emplace_back(std::move(tensors)); + } + return nested_tensorvec; + }(), + [&]() -> IndicesT { + if (!with_indices) { + return {}; + } else { + IndicesT indices; + indices.reserve(num_tensors); + return indices; + } + }()}); + for (const auto& list_index : c10::irange(num_lists)) { + if (!nested_tensorlist[list_index].empty()) { + grouped_tensors_with_indices[key].first[list_index].emplace_back( + nested_tensorlist[list_index][tensor_index]); + } + } + if (with_indices) { + grouped_tensors_with_indices[key].second.emplace_back(tensor_index); + } + } + + return grouped_tensors_with_indices; +} + +} // namespace +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FractionalMaxPooling.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FractionalMaxPooling.h new file mode 100644 index 0000000000000000000000000000000000000000..c52c41a314400cf3db633ff7f1f1a0827a724a68 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FractionalMaxPooling.h @@ -0,0 +1,85 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + +namespace at::native { + +template +inline std::vector generate_intervals( + scalar_t sample, + int64_t inputSize, + int64_t outputSize, + int64_t poolSize) { + std::vector sequence(outputSize); + if (outputSize > 1) { + scalar_t alpha = static_cast(inputSize - poolSize) / + static_cast(outputSize - 1); + + for (const auto i : c10::irange(outputSize - 1)) { + sequence[i] = + static_cast((i + sample) * alpha) - static_cast(sample * alpha); + } + } + if (outputSize > 0) { + sequence[outputSize - 1] = inputSize - poolSize; + } + return sequence; +} + +template +inline void fractional_max_pool_check_shape( + const Tensor& input, + const Tensor& randomSamples) { + + TORCH_CHECK( + input.scalar_type() == randomSamples.scalar_type(), + "Expect _random_samples to have the same dtype as input"); + + int64_t ndimension = randomSamples.ndimension(); + TORCH_CHECK( + ndimension == 3, + "Expect _random_samples to have 3 dimensions, got ", ndimension); + + int64_t N = randomSamples.size(0); + int64_t C = randomSamples.size(1); + int64_t D = randomSamples.size(2); + + int64_t input_batch = 0, input_channel = 0; + if (ndim == 2) { + // fractional_max_pool2d + if (input.ndimension() == 3) { + input_batch = 1; + input_channel = input.size(0); + } else { + input_batch = input.size(0); + input_channel = input.size(1); + } + } else { + // factional_max_pool3d + if (input.ndimension() == 4) { + input_batch = 1; + input_channel = input.size(0); + } else { + input_batch = input.size(0); + input_channel = input.size(1); + } + } + + TORCH_CHECK( + N >= input_batch, + "Expect _random_samples.size(0) no less then input batch size."); + TORCH_CHECK( + C == input_channel, + "Expect _random_samples.size(1) equals to input channel size."); + TORCH_CHECK( + D == ndim, + "Expect _random_samples.size(2) equals to ", ndim, "; got ", D, "."); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..a6d26c2e1d0286851018077475a9fead83df528f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at { +struct TensorIterator; + +namespace native { + +using _compute_linear_combination_fn = void(*)( + TensorIterator& iter, + int64_t in_stride, + int64_t coeff_stride, + int64_t num_summations +); + +DECLARE_DISPATCH(_compute_linear_combination_fn, _compute_linear_combination_stub) + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FusedAdagrad.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FusedAdagrad.h new file mode 100644 index 0000000000000000000000000000000000000000..a43863b5b0dca62c5c648dc086d698a24b4a91d0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FusedAdagrad.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include + +namespace at::native { + +using fused_adagrad_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& state_sum, + const at::Tensor& state_step, + const double lr, + const double lr_decay, + const double weight_decay, + const double eps, + const bool maximize, + const float* grad_scale_ptr); + +DECLARE_DISPATCH(fused_adagrad_fn, fused_adagrad_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FusedAdam.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FusedAdam.h new file mode 100644 index 0000000000000000000000000000000000000000..0ec3c0e854270d9609bc6eae5e471662db724d10 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FusedAdam.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include + +namespace at::native { + +enum class ADAM_MODE : uint8_t { ORIGINAL = 0, ADAMW = 1 }; + +using fused_adam_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& exp_avg, + const at::Tensor& exp_avg_sq, + const at::Tensor& max_exp_avg_sq, + const at::Tensor& state_step, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const float* grad_scale_ptr, + const ADAM_MODE); + +DECLARE_DISPATCH(fused_adam_fn, fused_adam_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FusedSGD.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FusedSGD.h new file mode 100644 index 0000000000000000000000000000000000000000..ed97d3c19054c62b65d7a0cd31e8eb1f3cfa61e1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/FusedSGD.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include + +namespace at::native { + +using fused_sgd_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& momentum_buffer, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr); + +DECLARE_DISPATCH(fused_sgd_fn, fused_sgd_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Gelu.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Gelu.h new file mode 100644 index 0000000000000000000000000000000000000000..e2bf52c98dd06d8764fb9dccd99a8843cb3bcd5f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Gelu.h @@ -0,0 +1,38 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { +// These constants control the approximation behavior of gelu function. +enum class GeluType { + None, // Baseline Gelu + Tanh, // Tanh Gelu Approximation + END +}; + +inline GeluType get_gelutype_enum(const std::string_view approximate) { + if (approximate == "none") { + return GeluType::None; + } else if (approximate == "tanh") { + return GeluType::Tanh; + } else { + TORCH_CHECK(false, "approximate argument must be either none or tanh."); + } +} + +inline std::string gelutype_to_string(const GeluType type) { + switch(type) { + case GeluType::None: return "none"; + case GeluType::Tanh: return "tanh"; + default: TORCH_CHECK(false, "unknown GELU type: ", static_cast(type)); + } +} + + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/GridSampler.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/GridSampler.h new file mode 100644 index 0000000000000000000000000000000000000000..bc561018ec5944b7cc9d118aeb220eee8e896591 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/GridSampler.h @@ -0,0 +1,303 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include + +namespace at::native { + +using detail::GridSamplerInterpolation; +using detail::GridSamplerPadding; + +// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value, +// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5). +// if align_corners: -1 and +1 get sent to the centers of the corner pixels +// -1 --> 0 +// +1 --> (size - 1) +// scale_factor = (size - 1) / 2 +// if not align_corners: -1 and +1 get sent to the image edges +// -1 --> -0.5 +// +1 --> (size - 1) + 0.5 == size - 0.5 +// scale_factor = size / 2 +template +static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, + bool align_corners) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1) / 2) * (size - 1); + } else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1) * size - 1) / 2; + } +} + +// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize +// except that it also returns the `d output / d input` via pointer argument +// `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static inline scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int64_t size, + bool align_corners, scalar_t *grad_in) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1) / 2) * (size - 1); + } else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + *grad_in = static_cast(size) / 2; + return ((coord + 1) * size - 1) / 2; + } +} + +// Clips coordinates to between 0 and clip_limit - 1 +template +static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) { + return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); +} + +// clip_coordinates_set_grad works similarly to clip_coordinates except that +// it also returns the `d output / d input` via pointer argument `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static inline scalar_t clip_coordinates_set_grad(scalar_t in, int64_t clip_limit, + scalar_t *grad_in) { + // Note that it is important for the gradient calculation that borders + // are considered out of bounds. + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + scalar_t max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +// Reflects coordinates until they fall between low and high (inclusive). +// The bounds are passed as twice their value so that half-integer values +// can be represented as ints. +template +static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low, + int64_t twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = std::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. + scalar_t extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +// reflect_coordinates_set_grad works similarly to reflect_coordinates except +// that it also returns the `d output / d input` via pointer argument +// `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static inline scalar_t reflect_coordinates_set_grad(scalar_t in, int64_t twice_low, + int64_t twice_high, scalar_t *grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + int grad_in_mult_; + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + // `fmod` returns same sign as `in`, which is positive after the `if` above. + scalar_t extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +// Mapping the out-of-boundary points back into boundary +// This would only affect padding_mode=border or reflection +template +static inline scalar_t compute_coordinates(scalar_t coord, int64_t size, + GridSamplerPadding padding_mode, + bool align_corners) { + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } else if (padding_mode == GridSamplerPadding::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates(coord, 0, 2*(size - 1)); + } else { + coord = reflect_coordinates(coord, -1, 2*size - 1); + } + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } + return coord; +} + +// Computes the pixel source index value for a grid coordinate +template +static inline scalar_t grid_sampler_compute_source_index( + scalar_t coord, + int64_t size, + GridSamplerPadding padding_mode, + bool align_corners) { + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; +} + +// grid_sampler_compute_source_index_set_grad works similarly to +// grid_sampler_compute_source_index except that it also returns the +// `d output / d input` via pointer argument `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static inline scalar_t grid_sampler_compute_source_index_set_grad( + scalar_t coord, + int64_t size, + GridSamplerPadding padding_mode, + bool align_corners, + scalar_t *grad_in) { + scalar_t grad_clip, grad_refl; + coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in); + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + coord = clip_coordinates_set_grad(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == GridSamplerPadding::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl); + } else { + coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl); + } + // clip coordinates to image borders + coord = clip_coordinates_set_grad(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + return coord; +} + +static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, int64_t H, int64_t W) { + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; +} + +template +static inline scalar_t get_value_bounded( + const scalar_t* data, + scalar_t x, + scalar_t y, + int64_t W, + int64_t H, + int64_t sW, + int64_t sH, + GridSamplerPadding padding_mode, + bool align_corners) { + + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int64_t ix = static_cast(x); + int64_t iy = static_cast(y); + + if (within_bounds_2d(iy, ix, H, W)) { + return data[iy * sH + ix * sW]; + } + return static_cast(0); +} + +template +static inline void safe_add_2d(scalar_t *data, int64_t h, int64_t w, + int64_t sH, int64_t sW, int64_t H, int64_t W, + scalar_t delta) { + if (within_bounds_2d(h, w, H, W)) { + data[h * sH + w * sW] += delta; + } +} + +template +static inline void safe_add_3d(scalar_t *data, int64_t d, int64_t h, int64_t w, + int64_t sD, int64_t sH, int64_t sW, + int64_t D, int64_t H, int64_t W, + scalar_t delta) { + if (within_bounds_3d(d, h, w, D, H, W)) { + data[d * sD + h * sH + w * sW] += delta; + } +} + +template +static inline void add_value_bounded( + scalar_t* data, + scalar_t x, + scalar_t y, + int64_t W, + int64_t H, + int64_t sW, + int64_t sH, + scalar_t delta, + GridSamplerPadding padding_mode, + bool align_corners) { + + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int64_t ix = static_cast(x); + int64_t iy = static_cast(y); + + safe_add_2d(data, iy, ix, sH, sW, H, W, delta); +} + +// Calculate the differential of the cubic convolution, i.e. `d coeff / d x` +template +static inline void get_cubic_coefficients_grad( + scalar_t coeffs[4], + scalar_t t) { + + // Must be the same as forward calculation in + // aten/src/ATen/native/UpSample.h:get_cubic_upsample_coefficients + scalar_t A = -0.75; + + scalar_t x; + x = -1 - t; // 1 < x = |-1 - tx| < 2 + coeffs[0] = (-3 * A * x - 10 * A ) * x - 8 * A; + x = -t; // x = |0 - tx| <= 1 + coeffs[1] = (-3 * (A + 2) * x - 2 * (A + 3)) * x; + x = 1 - t; // x = |1 - tx| <= 1 + coeffs[2] = (3 * (A + 2) * x - 2 * (A + 3)) * x; + x = 2 - t; // 1 < x = |2 - tx| < 2 + coeffs[3] = (3 * A * x - 10 * A) * x + 8 * A; +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/GridSamplerUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/GridSamplerUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..bd68648b3ebd1e1ae48f334659652323984c2dd6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/GridSamplerUtils.h @@ -0,0 +1,116 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// See NOTE: [Tensor vs. TensorBase] +// https://github.com/pytorch/pytorch/pull/66979 +#include +#include +#include + +namespace at::native { + +namespace detail { + +enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic}; +enum class GridSamplerPadding {Zeros, Border, Reflection}; + +} // namespace detail + +using detail::GridSamplerInterpolation; +using detail::GridSamplerPadding; + +// See NOTE [ grid_sampler Native Functions ]. +inline void check_grid_sampler_common( + const TensorBase& input, + const TensorBase& grid +) { + auto input_opt = input.options(); + auto grid_opt = grid.options(); + + TORCH_CHECK( + input.defined(), + "grid_sampler(): expected input to not be undefined"); + TORCH_CHECK( + grid.defined(), + "grid_sampler(): expected grid to not be undefined"); + TORCH_CHECK( + input_opt.device() == grid_opt.device(), + "grid_sampler(): expected input and grid to be on same device, but input " + "is on ", input_opt.device(), " and grid is on ", grid_opt.device()); + TORCH_CHECK( + input_opt.layout() == kStrided && grid_opt.layout() == kStrided, + "grid_sampler(): expected input and grid to have torch.strided layout, but " + "input has ", input_opt.layout(), " and grid has ", grid_opt.layout()); + TORCH_CHECK( + input.size(0) == grid.size(0), + "grid_sampler(): expected grid and input to have same batch size, but got " + "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); + TORCH_CHECK( + grid.size(-1) == input.dim() - 2, + "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " + "dimension, but got grid with sizes ", grid.sizes()); + + for (const auto i : c10::irange(2, input.dim())) { + TORCH_CHECK(input.size(i) > 0, + "grid_sampler(): expected input to have non-empty spatial dimensions, " + "but input has sizes ", input.sizes(), " with dimension ", i, " being " + "empty"); + } +} + +// See NOTE [ grid_sampler Native Functions ]. +inline void check_grid_sampler_2d( + const TensorBase& input, + const TensorBase& grid +) { + TORCH_CHECK( + input.dim() == 4 && input.dim() == grid.dim(), + "grid_sampler(): expected 4D input and grid with same number of " + "dimensions, but got input with sizes ", input.sizes(), + " and grid with sizes ", grid.sizes()); +} + +// See NOTE [ grid_sampler Native Functions ]. +inline void check_grid_sampler_3d( + const TensorBase& input, + const TensorBase& grid, + int64_t interpolation_mode +) { + TORCH_CHECK( + input.dim() == 5 && input.dim() == grid.dim(), + "grid_sampler(): expected 5D input and grid with same number of " + "dimensions, but got input with sizes ", input.sizes(), + " and grid with sizes ", grid.sizes()); + TORCH_CHECK( + !(input.dim() == 5 && + static_cast(interpolation_mode) == + GridSamplerInterpolation::Bicubic), + "grid_sampler(): bicubic interpolation only supports 4D input"); +} + +// See NOTE [ grid_sampler Native Functions ]. +// cudnn does not support inputs larger than 1024. +inline bool cond_cudnn_grid_sampler( + const TensorBase& input, + const TensorBase& grid +) { + auto st = input.scalar_type(); + if (!(st == kDouble || st == kFloat || st == kHalf)) + return false; + st = grid.scalar_type(); + if (!(st == kDouble || st == kFloat || st == kHalf)) + return false; + return ( + at::native::cudnn_is_acceptable(input) && + at::native::cudnn_is_acceptable(grid) && + at::native::canUse32BitIndexMath(input) && + at::native::canUse32BitIndexMath(grid) && + input.dim() == 4 && + input.sym_size(1) <= 1024); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/GroupedMMUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/GroupedMMUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..e63c1acf163bddb4c0b1b5fb7c8ec658b3c0dd84 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/GroupedMMUtils.h @@ -0,0 +1,172 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { + +inline bool check_valid_strides_and_return_transposed(const Tensor& mat) { + IntArrayRef tensor_strides = mat.strides(); + IntArrayRef tensor_sizes = mat.sizes(); + int end_dim = mat.dim() - 1; + int alignment = 16 / mat.element_size(); + TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); + if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { + TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); + return true; + } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { + TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); + return false; + } else { + TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); + } +} + +inline at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, +const Tensor& mat_b, +const std::optional& offs, +c10::ScalarType out_dtype +) { + c10::SmallVector out_size; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d) { + if (b_is_2d) { + out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; + } else { + TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(0), mat_b.size(-1)}; + } + } else { + if (b_is_2d) { + // this case is not actually encountered for MoE gemms + TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(1), mat_b.size(1)}; + } else { // regular bmm + TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); + out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; + } + } + + #ifndef USE_ROCM + // For TMA transfers, strides of output tensor have to be either + // 1, or aligned to 16 bytes. + const auto last_dim = out_size.size() - 1; + const auto alignment = 16 / c10::elementSize(out_dtype); + const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; + std::vector out_stride; + if (a_is_2d != b_is_2d) { + out_stride = {size_padded, 1}; + } else { + out_stride = {out_size[1] * size_padded, size_padded, 1}; + } + return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype)); + #else + return at::empty(out_size, mat_a.options().dtype(out_dtype)); + #endif +} + +inline void _grouped_mm_validate_inputs(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + TORCH_CHECK((mat_a.dtype() == at::kBFloat16) || (mat_a.dtype() == at::kFloat) || (mat_a.dtype() == at::kHalf), "Expected mat_a to be Float32, BFloat16 or Float16 matrix, got ", mat_a.scalar_type()); + TORCH_CHECK((mat_b.dtype() == at::kBFloat16) || (mat_b.dtype() == at::kFloat) || (mat_b.dtype() == at::kHalf), "Expected mat_b to be Float32, BFloat16 or Float16 matrix, got ", mat_b.scalar_type()); + TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (!a_is_2d || !b_is_2d) { + TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + } + + // check that the strides are valid, the fn will throw an error if not + check_valid_strides_and_return_transposed(mat_a); + check_valid_strides_and_return_transposed(mat_b); + TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); + + if (offs.has_value()) { + TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); +} + +inline c10::ScalarType _resolve_grouped_mm_out_dtype(const Tensor& mat_a, const Tensor& mat_b, +std::optional out_dtype) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + // TODO(future PR): enable float32 output dtype for bfloat16 and float16 inputs + TORCH_CHECK(out_dtype_ == mat_a.dtype(), "Grouped gemm output dtype must match `mat_a` dtype"); + return out_dtype_; +} + + +inline void _grouped_mm_fallback(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype, +Tensor out) { + LOG(INFO) << "fallback path for `torch._grouped_mm`, performance may not be optimal"; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d && !b_is_2d) { + // 2d x 3d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_a_slice = mat_a.slice(0, group_start_idx, group_end_idx); + auto out_slice = out.slice(0, group_start_idx, group_end_idx); + at::mm_out(out_slice, mat_a_slice, mat_b[group_idx]); + group_start_idx = group_end_idx; + } + + } else if (!a_is_2d && b_is_2d) { + // 3d x 2d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_b_slice = mat_b.slice(1, group_start_idx, group_end_idx); + auto out_slice = out.slice(1, group_start_idx, group_end_idx); + at::mm_out(out_slice, mat_a[group_idx], mat_b_slice); + group_start_idx = group_end_idx; + } + + } else if (a_is_2d && b_is_2d) { + // 2d x 2d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_a_slice = mat_a.slice(1, group_start_idx, group_end_idx); + auto mat_b_slice = mat_b.slice(0, group_start_idx, group_end_idx); + auto out_slice = out[group_idx]; + at::mm_out(out_slice, mat_a_slice, mat_b_slice); + group_start_idx = group_end_idx; + } + + } else { + // 3d x 3d without offsets - regular bmm + at::bmm_out(out, mat_a, mat_b); + } +} + + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Histogram.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Histogram.h new file mode 100644 index 0000000000000000000000000000000000000000..7ef0b8c873a0e6d9c5bac1c8e7e495ad854c87ae --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Histogram.h @@ -0,0 +1,21 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +using histogramdd_fn = void(*)(const Tensor&, const std::optional&, bool, Tensor&, const TensorList&); +using histogramdd_linear_fn = void(*)(const Tensor&, const std::optional&, bool, Tensor&, const TensorList&, bool); +using histogram_select_outer_bin_edges_fn = void(*)(const Tensor& input, const int64_t N, std::vector &leftmost_edges, std::vector &rightmost_edges); + +DECLARE_DISPATCH(histogramdd_fn, histogramdd_stub) +DECLARE_DISPATCH(histogramdd_linear_fn, histogramdd_linear_stub) +DECLARE_DISPATCH(histogram_select_outer_bin_edges_fn, histogram_select_outer_bin_edges_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/IndexKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/IndexKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..00b2a30be157021b5d934108e68282ce5c51e708 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/IndexKernel.h @@ -0,0 +1,46 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +class Tensor; +class TensorBase; +struct TensorIterator; +struct TensorIteratorBase; +} + +namespace c10 { +class Scalar; +} + +namespace at::native { + +using index_fn = void(*)(TensorIteratorBase &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides); +using index_fill_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride, const Scalar& source); +using index_copy_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride); +using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate); +using put_fn = void(*)(TensorIterator & iter, const TensorBase& self, const bool accumulate); +using take_fn = void(*)(TensorIterator & iter, const TensorBase& input); +using flip_fn = void(*)(TensorIterator &, const bool); +using masked_fill_fn = void(*)(TensorIterator &, const Scalar& scalar); +using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride); +using masked_scatter_fn = void(*)(TensorIterator &, const TensorBase &); + +DECLARE_DISPATCH(index_fn, index_stub) +DECLARE_DISPATCH(index_fill_fn, index_fill_stub) +DECLARE_DISPATCH(index_copy_fn, index_copy_stub) +DECLARE_DISPATCH(index_put_fn, index_put_stub) +DECLARE_DISPATCH(put_fn, put_stub) +DECLARE_DISPATCH(take_fn, take_stub) +DECLARE_DISPATCH(flip_fn, flip_stub) +DECLARE_DISPATCH(masked_fill_fn, masked_fill_stub) +DECLARE_DISPATCH(masked_select_fn, masked_select_serial_stub) +DECLARE_DISPATCH(masked_select_fn, masked_select_stub) +DECLARE_DISPATCH(masked_scatter_fn, masked_scatter_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/IndexingUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/IndexingUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..f462528e7ffbfa9f87b5cacf8f6158948e8e1df6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/IndexingUtils.h @@ -0,0 +1,186 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + +namespace at::native { + +[[noreturn]] +static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, int64_t maskIdx) { + TORCH_CHECK_INDEX(false, "The shape of the mask ", mask.sizes(), " at index ", maskIdx, + " does not match the shape of the indexed tensor ", self.sizes(), " at index ", idx); +} + +[[maybe_unused]] static std::vector expandTensors( + const Tensor& self, + IOptTensorListRef indices, + bool ensure_same_device = false) { + // If indices come in as ByteTensor or BoolTensor (masks), expand them into + // the equivalent indexing by LongTensors + std::vector result; + for (const auto& index_opt : indices) { + if (!index_opt.has_value()) { + result.emplace_back(); + } else { + const auto& index = *index_opt; + if (index.scalar_type() == kByte || index.scalar_type() == kBool) { + if (index.scalar_type() == kByte) { + TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \ + " please use a dtype torch.bool instead."); + } + // The sizes of the ByteTensor mask or bool tensor must match the sizes of the + // corresponding dimensions in self + for (const auto j : c10::irange(index.dim())) { + int64_t srcIdx = static_cast(result.size() + j); + if (index.size(j) != self.size(srcIdx)) { + invalid_mask(self, srcIdx, index, j); + } + } + // Replace with nonzeros + at::Tensor nonzero; + if (ensure_same_device && index.device() != self.device()) { + bool non_blocking = index.is_cpu() && self.device().is_cuda(); + auto out = at::empty({0}, index.options().dtype(kLong).pinned_memory(non_blocking)); + nonzero = at::nonzero_out(out, index).to(self.device(), non_blocking); + } else { + nonzero = index.nonzero(); + } + for (const auto j : c10::irange(index.dim())) { + result.emplace_back(nonzero.select(1, j)); + } + } else if (ensure_same_device && index.device() != self.device()) { + result.emplace_back(index.to(self.device())); + } else { + result.emplace_back(index); + } + } + } + return result; +} + +[[maybe_unused]] static void checkIndexTensorTypes( + IOptTensorListRef indices, + bool allow_int = false) { + for (const auto& tensor : indices) { + if (tensor.has_value() && tensor->defined()) { + auto scalarType = tensor->scalar_type(); + if (allow_int) { + if (scalarType != kLong && scalarType != kByte && scalarType != kBool && scalarType != kInt) { + TORCH_CHECK_INDEX(false, "tensors used as indices must be long, int, byte or bool tensors"); + } + } else { + if (scalarType != kLong && scalarType != kByte && scalarType != kBool) { + TORCH_CHECK_INDEX(false, "tensors used as indices must be long, byte or bool tensors"); + } + } + } + } +} + +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; + result.reserve(list.size()); + for (const Tensor& a : list) { + result.push_back(a); + } + return result; +} + +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; + result.reserve(list.size()); + for (const IValue& a : list) { + result.push_back(a.isTensor() ? std::optional(a.toTensor()) : std::optional()); + } + return result; +} + +[[maybe_unused]] static bool hasContiguousSubspace(TensorList tl) { + // true if all the non-null tensors are adjacent + auto isDefined = [](const Tensor & tensor){ return tensor.defined(); }; + auto isNull = [](const Tensor & tensor){ return !tensor.defined(); }; + auto start = std::find_if(tl.begin(), tl.end(), isDefined); + auto stop = std::find_if(tl.rbegin(), tl.rend(), isDefined); + auto it = std::find_if(start, stop.base(), isNull); + return it == stop.base(); +} + +// Transposes the tensor and indices together so that all the non-null indices +// index the first k dimensions of the tensor. Returns the transposed tensor +// and the reordered indices. For example: +// transposeToFront(tensor, {nullptr, a, nullptr, b}) +// returns +// tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr} +[[maybe_unused]] static std::tuple> transposeToFront( + const Tensor& self, + TensorList indices) { + std::vector dims; + std::vector transposedIndices; + dims.reserve(self.dim()); + for (const auto i : c10::irange(self.dim())) { + if (indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(indices[i]); + } + } + for (const auto i : c10::irange(self.dim())) { + if (!indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(); + } + } + return std::make_tuple(self.permute(dims), std::move(transposedIndices)); +} + +inline std::tuple, std::vector> +transposeToFrontAndInvPerm(const Tensor& self, TensorList indices) { + std::vector dims; + std::vector invPerm; + std::vector transposedIndices; + dims.reserve(self.dim()); + invPerm.resize(self.dim()); + for (const auto i : c10::irange(self.dim())) { + if (indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(indices[i]); + } + } + for (const auto i : c10::irange(self.dim())) { + if (!indices[i].defined()) { + dims.push_back(i); + transposedIndices.emplace_back(); + } + } + for (const auto i : c10::irange(self.dim())) { + invPerm[dims[i]] = i; + } + return std::make_tuple(self.permute(dims), std::move(transposedIndices), std::move(invPerm)); +} + +struct AdvancedIndex { + AdvancedIndex(const Tensor& src, TensorList indices); + + Tensor src; + std::vector indices; + DimVector indexed_sizes; + DimVector indexed_strides; + int64_t dims_before; + int64_t dims_after; +}; + + +} //namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Lerp.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Lerp.h new file mode 100644 index 0000000000000000000000000000000000000000..9a51a5c712a9dacee54261aafa9e78d07445de67 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Lerp.h @@ -0,0 +1,51 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +template +C10_HOST_DEVICE C10_ALWAYS_INLINE bool is_lerp_weight_small(scalar_t weight) { + return std::abs(weight) < scalar_t(0.5); +} +template +C10_HOST_DEVICE C10_ALWAYS_INLINE bool is_lerp_weight_small(c10::complex weight) { + // Avoid the sqrt in abs(weight) + return (weight.real() * weight.real() + weight.imag() * weight.imag()) < scalar_t(0.25); +} + +template +C10_HOST_DEVICE C10_ALWAYS_INLINE scalar_t lerp(scalar_t self_, scalar_t end_, weight_t weight_) { + using opmath_t = at::opmath_type; + using opmath_weight_t = at::opmath_type; + + opmath_t self = self_; + opmath_t end = end_; + opmath_weight_t weight = weight_; + + // Conditional for better numeric. This has been discussed in + // https://github.com/pytorch/pytorch/pull/18871 + return is_lerp_weight_small(weight) + ? self + weight * (end - self) + : end - (end - self) * (opmath_t(1) - weight); +} + +using lerp_fn_scalar = void (*)( + at::TensorIteratorBase& iter, + const Scalar& weight); + +using lerp_fn_tensor = void (*)( + at::TensorIteratorBase& iter); + +DECLARE_DISPATCH(lerp_fn_scalar, lerp_kernel_scalar_weight) +DECLARE_DISPATCH(lerp_fn_tensor, lerp_kernel_tensor_weight) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/LinearAlgebra.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/LinearAlgebra.h new file mode 100644 index 0000000000000000000000000000000000000000..fa0e95971a5231ef8327f12511a25509a6e19f69 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/LinearAlgebra.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10 { +class Scalar; +} + +namespace at { +struct TensorIterator; +} + +namespace at::native { + +using addr_fn = void (*)(TensorIterator &, const Scalar& beta, const Scalar& alpha); +DECLARE_DISPATCH(addr_fn, addr_stub) +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..47633d336ff50c372081000540c37dca16e17650 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h @@ -0,0 +1,629 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +inline c10::MaybeOwned expect_resolved_conj(const Tensor& tensor) { + if (tensor.is_conj()) { + return c10::MaybeOwned::owned(tensor.resolve_conj()); + } else { + return c10::MaybeOwned::borrowed(tensor); + } +} + +inline DimVector batched_matrix_contiguous_strides( + const IntArrayRef sizes, + const bool f_contig = false) { + // f_contig chooses between the strides of a batch of Fortran (F-contiguous) + // and C-contiguous matrices + auto strides = c10::contiguous_strides(sizes); + auto dim = strides.size(); + + if (f_contig && dim >= 2) { + // Fix the strides of the last two dimensions, so that we return + // C-contiguous batches of F-contiguous matrices. + strides[dim - 1] = std::max(sizes[dim - 2], static_cast(1)); + strides[dim - 2] = 1; + } + return strides; +} + +/* + * Clones a Tensor so that the following conditions hold: + * If we think of a Tensor of having size (B, M, N), where B is any number + * of batch dimensions, then: + * - Each (M, N) matrix is in column major form + * - Let Tensor P have size (B, M, N) and Q have size (B, M', N'). + * Then when laid out in memory, the M by N matrix starting at + * P.data_ptr()[B * M * N] is of the same corresponding batch as the M' by N' + * matrix starting at Q.data_ptr()[B * M' * N']. + */ +inline Tensor cloneBatchedColumnMajor(const Tensor& src) { + // If src is already in batched column major format, then + // this will be efficient (no reordering of the data will occur) + // because the first transpose will make the tensor contiguous, + // and cloning a contiguous tensor is fast. + auto result = src.mT().clone(at::MemoryFormat::Contiguous); + result.transpose_(-2, -1); + return result; +} + +/* + * contig chooses between C-contig (true) and F-contig (false) + */ +inline c10::MaybeOwned borrow_else_clone(const bool cond, const Tensor& borrow, const Tensor& clone, const bool contig) { + return cond ? c10::MaybeOwned::borrowed(borrow) + : c10::MaybeOwned::owned(contig ? clone.clone(MemoryFormat::Contiguous) + : cloneBatchedColumnMajor(clone)); +} + +/* + * This method is designed to be a faster alternative to + * `cloneBatchedColumnMajor` with some additional features, + * namely: + * 1. It uses `copy` instead of `clone` which could be much faster. + * 2. `nrows` parameter used to create inputs with the number of rows larger + * than the original input, which is required for some LAPACK/MAGMA methods. + * 3. `desired_batch_size` is used to create copies with the batch size + * which is either the original batch size of the input, or its larger + * broadcasted shape. + */ +inline Tensor copyBatchedColumnMajor(const Tensor& src, int64_t nrows = -1, + at::OptionalIntArrayRef desired_batch_sizes = std::nullopt) { + nrows = (nrows == -1) ? src.size(-2) : nrows; + auto copy_sizes = desired_batch_sizes.has_value() + ? desired_batch_sizes.value().vec() + : IntArrayRef(src.sizes().data(), src.dim() - 2).vec(); + copy_sizes.insert(copy_sizes.end(), {nrows, src.size(-1)}); + const auto copy_strides = batched_matrix_contiguous_strides(copy_sizes, /*f-contig*/true); + auto copy = at::empty_strided(copy_sizes, copy_strides, src.options()); + copy.narrow(-2, 0, src.size(-2)).copy_(src); + return copy; +} + +/* + * Given batches of matrices with arbitrary batch dim, + * computes the number of batches. + */ +inline int64_t batchCount(const Tensor& batched_matrices) { + int64_t result = 1; + for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) { + result *= batched_matrices.size(i); + } + return result; +} + +// Computes the number of elements of a matrix in a batched matrix tensor +inline int64_t matrixStride(const Tensor& batched_matrices) { + return batched_matrices.size(-1) * batched_matrices.size(-2); +} + +// Validates input shapes for operations on batches of square matrices (inverse, cholesky, symeig, eig) +inline void checkIsMatrix(const Tensor& A, const char* const f_name, const char* const arg_name = "A") { + TORCH_CHECK(A.dim() >= 2, f_name, ": The input tensor ", arg_name, " must have at least 2 dimensions."); +} +inline void squareCheckInputs(const Tensor& self, const char* const f_name, const char* const arg_name = "A") { + checkIsMatrix(self, f_name, arg_name); + TORCH_CHECK(self.sym_size(-1) == self.sym_size(-2), + f_name, + ": ", arg_name, " must be batches of square matrices, " + "but they are ", self.sym_size(-2), " by ", self.sym_size(-1), " matrices"); +} + +inline void checkInputsSolver(const Tensor& A, + const Tensor& B, + const bool left, + const char* const f_name) { + squareCheckInputs(A, f_name, "A"); + checkIsMatrix(B, f_name, "B"); + TORCH_CHECK(left ? A.size(-2) == B.size(-2) : A.size(-1) == B.size(-1), + f_name, ": Incompatible shapes of A and B for the equation ", + left ? "AX = B" : "XA = B", + " (", A.size(-2), "x", A.size(-1), " and ", B.size(-2), "x", B.size(-1), ")"); +} + +inline bool is_row_or_column_contiguous(const Tensor& t) { + // This could be made more general, similar to how it's checked in matmul, which would allow to + // elide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky. + // We choose to be conservative for simplicity + return t.is_contiguous() || t.transpose(-2, -1).is_contiguous(); +} + +inline TransposeType to_transpose_type(const bool contig, const bool conj) { + if (conj) { + if (contig) { TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); } + else { return TransposeType::ConjTranspose; } + } else { + if (contig) { return TransposeType::NoTranspose; } + else { return TransposeType::Transpose; } + } +} + + +// This function is designed to be used with linear algebra methods that minimize +// L(ax - b) = 0, where L is generally the identity map (`solve`, for example) +// or the L2 norm (`lstsq`). +// It is expected that `a` and `b` are contiguous tensors of column-major matrices +// (so that a.view({-1, a.size(-2), a.size(-1)}) succeeds, same for `b`), +// with the following additional properties: +// +// 1. a.dim() == b.dim() +// 2. a.shape[:-2] broadcasts over b.shape[:-2] +// 3. a.size(i) <= b.size(i) for i=0,..., a.dim() - 3 (only for batch dimensions) +// +// MAGMA/LAPACK modify tensor `a` in-place, and the main goal of this method +// is to be memory efficient, which means that if there exists an index i such that +// a.shape[i] < b.shape[i], 0 <= i <= a.dim() - 3, +// then instead of materializing copies of `a` in the broadcasted shape, we keep +// a buffer copy of `a` along with flags that check whether specific batch dimension +// indices for `a` were already accessed. If they were, we copy the data from the buffer +// into `a`. The number of copies does not exceed +// prod(max(a.shape[:-2], b.shape[:-2]) - a.shape[:-2] + 1) +// and this value is attained by tensors with non-empty batch dimensions. +// +// func_t `f` is a callable that is being supplied with +// scalar_t* a_working_ptr, scalar_t* b_working_ptr, int64_t a_linear_batch_idx. +// a_working_ptr and b_working_ptr can directly be passed to LAPACK/MAGMA routines, +// and a_linear_batch_idx is an index in the 3d representation which corresponds to +// the memory a_working_ptr points to, in other words: +// a_working_ptr == a.view({-1, a.size(-2), a.size(-1)}.select(0, a_linear_batch_idx).data_ptr(); +// a_linear_batch_idx is useful to store metadata related to `a`, such as, for example, +// its rank or singular values (see linalg_lstsq). +template +void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const func_t& f) { + IntArrayRef a_batch_sizes(a.sizes().data(), a.dim() - 2); + IntArrayRef b_batch_sizes(b.sizes().data(), b.dim() - 2); + + auto a_linear_batch_idx = at::arange(batchCount(a)).view(a_batch_sizes); + auto b_linear_batch_idx = at::arange(batchCount(b)).view(b_batch_sizes); + + TensorIterator iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(b_linear_batch_idx) + .add_input(a_linear_batch_idx) + .build(); + + auto m = a.size(-2); + auto n = a.size(-1); + auto a_3d = a.view({batchCount(a), m, n}); + auto b_3d = b.view({batchCount(b), b.size(-2), b.size(-1)}); + + auto a_broadcasts_over_b = (a_batch_sizes != b_batch_sizes); + Tensor a_buffer, a_was_accessed, a_buffer_3d; + std::function check_if_copy_needed_for_a + = [](int64_t /*a_curr_linear_batch_idx*/){}; + if (a_broadcasts_over_b) { + a_buffer = at::empty_strided(a.sizes(), a.strides(), a.options()) + .copy_(a); + a_was_accessed = at::zeros(batchCount(a), at::kBool); + a_buffer_3d = a_buffer.view({batchCount(a), m, n}); + check_if_copy_needed_for_a = [&](int64_t a_curr_linear_batch_idx) { + auto* a_was_accessed_flag = a_was_accessed + .select(0, a_curr_linear_batch_idx) + .data_ptr(); + if (!(*a_was_accessed_flag)) { + *a_was_accessed_flag = true; + } + else { + a_3d.select(0, a_curr_linear_batch_idx) + .copy_(a_buffer_3d.select(0, a_curr_linear_batch_idx)); + } + }; + } + + auto loop = [&](char** data, const int64_t* strides, int64_t nelems) { + auto* b_batch_idx_ptr = data[0]; + auto* a_batch_idx_ptr = data[1]; + + for ([[maybe_unused]] const auto elem : c10::irange(nelems)) { + auto b_curr_linear_batch_idx = + *reinterpret_cast(b_batch_idx_ptr); + auto a_curr_linear_batch_idx = *reinterpret_cast(a_batch_idx_ptr); + + check_if_copy_needed_for_a(a_curr_linear_batch_idx); + + auto* a_working_ptr = a_3d.select(0, a_curr_linear_batch_idx) + .data_ptr(); + auto* b_working_ptr = b_3d.select(0, b_curr_linear_batch_idx) + .data_ptr(); + f(a_working_ptr, b_working_ptr, a_curr_linear_batch_idx); + + b_batch_idx_ptr += strides[0]; + a_batch_idx_ptr += strides[1]; + } + }; + iter.serial_for_each(loop, {0, batchCount(b)}); +} + +// Returns the epsilon value for floating types except half +inline double _get_epsilon(const ScalarType& sc_type) { + switch (sc_type) { + case at::ScalarType::Float: + return static_cast(std::numeric_limits::epsilon()); + case at::ScalarType::Double: + return std::numeric_limits::epsilon(); + default: + TORCH_CHECK(false, "This function doesn't handle types other than float and double"); + } +} + +// Validates input shapes and devices +// for linear solve methods (solve, cholesky_solve, lu_solve, triangular_solve) +inline void linearSolveCheckInputs(const Tensor& self, const Tensor& A, const char* name) { + TORCH_CHECK(self.device() == A.device(), + "Expected b and A to be on the same device, but found b on ", + self.device(), " and A on ", A.device(), " instead."); + + TORCH_CHECK(self.scalar_type() == A.scalar_type(), + "Expected b and A to have the same dtype, but found b of type ", + self.scalar_type(), " and A of type ", A.scalar_type(), " instead."); + + TORCH_CHECK(A.size(-1) == A.size(-2), + "A must be batches of square matrices, " + "but they are ", A.size(-2), " by ", A.size(-1), " matrices"); + + TORCH_CHECK(A.size(-1) == self.size(-2), + "Incompatible matrix sizes for ", name, ": each A " + "matrix is ", A.size(-1), " by ", A.size(-1), + " but each b matrix is ", self.size(-2), " by ", self.size(-1)); +} + +inline void checkFloatingOrComplex(const Tensor& t, const char* const f_name, const bool allow_low_precision_dtypes=true) { + auto dtype = t.scalar_type(); + TORCH_CHECK((at::isFloatingType(dtype) || at::isComplexType(dtype)), + f_name, ": Expected a floating point or complex tensor as input. Got ", dtype); + if (!allow_low_precision_dtypes) { + TORCH_CHECK(dtype == kFloat || dtype == kDouble || dtype == kComplexFloat || dtype == kComplexDouble, + f_name, ": Low precision dtypes not supported. Got ", dtype); + } +} + + +// Checks if all the Tensors in a TensorList are of the same dimensions +inline void checkAllSameDim(TensorList tensors, int64_t dim) { + for (auto &t : tensors) { + TORCH_CHECK(t.dim() == dim, "Tensor dimension is ", t.dim(), ", expected ", dim, " instead."); + } +} + +inline std::tuple, std::vector> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) { + // broadcast the batch dimensions of arg1 and arg2. + IntArrayRef arg1_batch_sizes(arg1.sizes().data(), arg1.ndimension() - 2); + IntArrayRef arg2_batch_sizes(arg2.sizes().data(), arg2.ndimension() - 2); + std::vector expand_batch_portion = infer_size(arg1_batch_sizes, arg2_batch_sizes); + + std::vector arg1_expand_size({expand_batch_portion}); + arg1_expand_size.insert(arg1_expand_size.end(), { arg1.size(-2), arg1.size(-1) }); + + std::vector arg2_expand_size({expand_batch_portion}); + arg2_expand_size.insert(arg2_expand_size.end(), { arg2.size(-2), arg2.size(-1) }); + return std::make_tuple(std::move(arg1_expand_size), std::move(arg2_expand_size)); +} + +inline std::tuple _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) { + // If there's no name we assume we don't want to check the errors + if (name != nullptr) { + linearSolveCheckInputs(arg1, arg2, name); + } + + auto [arg1_expand_size, arg2_expand_size] = at::native::_linalg_broadcast_batch_dims(arg1, arg2); + + auto arg1_broadcasted = arg1_expand_size == arg1.sizes() ? arg1 : arg1.expand(arg1_expand_size); + auto arg2_broadcasted = arg2_expand_size == arg2.sizes() ? arg2 : arg2.expand(arg2_expand_size); + return std::make_tuple(arg1_broadcasted, arg2_broadcasted); +} + +inline std::vector broadcast_batch_size(const Tensor& t1, const Tensor& t2, int64_t n_batch_dims) { + IntArrayRef t1_batch_sizes(t1.sizes().data(), n_batch_dims); + IntArrayRef t2_batch_sizes(t2.sizes().data(), n_batch_dims); + auto broadcasted_batch_sizes = infer_size(t1_batch_sizes, t2_batch_sizes); + return broadcasted_batch_sizes; +} + +// Return a permutation with the given axes moved to the end. +inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) { + const std::vector a = axes.vec(); + const int64_t ndim = self.ndimension(); + std::vector perm; + + for (const auto i : c10::irange(ndim)) { + auto it = std::find(a.begin(), a.end(), i); + if (it == a.end()) { + perm.push_back(i); + } + } + for (auto i : a) { + perm.push_back(i); + } + + TORCH_CHECK((int64_t)perm.size() == ndim, + "duplicate or invalid axis in 'dim' argument for tensor with ndim==", ndim); + + return self.permute(perm); +} + +// parse the "mode" param in linalg_qr: return a tuple of bools (compute_q, reduced) +inline std::tuple _parse_qr_mode(std::string_view mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; // this is actually irrelevant in this mode + } else { + TORCH_CHECK(false, "qr received unrecognized mode '", mode, + "' but expected one of 'reduced' (default), 'r', or 'complete'"); + } + return std::make_tuple(compute_q, reduced); +} + +// Function to compute sizes, strides and the extra columns for the Q matrix in the QR Decomposition +inline std::tuple _compute_geometry_for_Q( + const Tensor& input, + bool reduced) { + int64_t m = input.size(-2), n = input.size(-1); + int64_t n_columns_q; + + // We need to compute the required size of Q based on the `reduced` option + DimVector q_sizes(input.sizes()); + if (!reduced && m > n) { + q_sizes[input.dim() - 1] = m; + n_columns_q = m; + } else { + q_sizes[input.dim() - 1] = n; + n_columns_q = std::min(m, n); + } + auto q_strides = batched_matrix_contiguous_strides(q_sizes, /*f-contig*/true); + return std::make_tuple(q_sizes, q_strides, n_columns_q); +} + +inline bool svd_uses_cusolver(const Tensor& A) { + // if cusolver is available, it is used unconditionally + return A.is_cuda() + && at::globalContext().hasCuSOLVER() + && at::globalContext().linalgPreferredBackend() != at::LinalgBackend::Magma; +} + + +// Function used instead of .to so that the original strides are retained +// .to doesn't retain strides and make the output tensor contiguous +inline Tensor same_stride_to(const Tensor& original_tensor, const at::TensorOptions& options) { + auto strided_to = at::empty_strided(original_tensor.sizes(), + original_tensor.strides(), + options); + strided_to.copy_(original_tensor); + return strided_to; +} + +// Creates a dimension permutation array that can be given to `at::permute()`, which will shift +// the two specified dimensions to the end of a tensor, without changing the order of +// the other dimensions. `dim1` will be placed at the very end, and `dim0` will be +// placed just to the left of it. +// +// For instance, given a 4-D tensor, dimensions 1 and 3 can be shifted to the end by +// calling `create_dim_backshift_permutation(1, 3, 4)`. The resulting vector will +// be `vec(0, 2, 1, 3)`. +inline std::vector create_dim_backshift_permutation(int64_t dim0, int64_t dim1, int64_t ndim) { + TORCH_CHECK( + (dim0 != dim1) && (dim0 < ndim) && (dim0 >= 0) && (dim1 < ndim) && (dim1 >= 0), + "duplicate or invalid dimensions"); + std::vector permutation(ndim); + int64_t cur_permuted_dim = 0; + for (const auto dim_ind : c10::irange(ndim)) { + if ((dim_ind != dim0) && (dim_ind != dim1)) { + permutation[cur_permuted_dim++] = dim_ind; + } + } + permutation[cur_permuted_dim++] = dim0; + permutation[cur_permuted_dim] = dim1; + return permutation; +} + +// Creates a dimension permutation array that can be given to `at::permute()`, which +// will reverse a given permutation. +// The reverse permutation array is created by swapping the indices and their +// associated values from the given permutation array. +inline std::vector create_reverse_permutation(std::vector permutation) { + int64_t ndim = permutation.size(); + std::vector reverse_permutation(ndim); + for (const auto dim_ind : c10::irange(ndim)) { + reverse_permutation[permutation[dim_ind]] = dim_ind; + } + return reverse_permutation; +} + +// Compute R-work array size for MAGMA/LAPACK cgesdd/zgesdd +// See https://github.com/Reference-LAPACK/lapack/blob/122506cd8b6ce050a200920c3d4c0b153b150fd8/SRC/cgesdd.f#L186 +inline int64_t computeLRWorkDim(const char jobz, int64_t m, int64_t n) { + auto mn = std::min(m, n); + auto mx = std::max(m, n); + if (jobz == 'N') { +#ifdef __APPLE__ + // According to `vecLib.framework/Headers/clapack.h` Accelerate.framework is based on LAPACK 3.2.1 + return 7 * mn; +#else + // These setting is valid for on LAPACK 3.6+ + return 5 * mn; +#endif + } + if (mx > 10 * mn) { + return 5 * mn * mn + 5 * mn; + } + return std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn); +} + +// This function checks whether the uplo argument input is valid +// Allowed strings are "u", "U", "l", "L" +inline void checkUplo(const std::string_view uplo) { + // To use std::toupper safely with plain chars (or signed chars), the argument should first be converted to unsigned char + char uplo_uppercase = static_cast(std::toupper(static_cast(uplo[0]))); + TORCH_CHECK(uplo.size() == 1 && (uplo_uppercase == 'U' || uplo_uppercase == 'L'), + "Expected UPLO argument to be 'L' or 'U', but got ", uplo); +} + +inline void checkSameDevice(const std::string& fn_name, Tensor result, Tensor input, const std::string& result_name = "result") { + TORCH_CHECK( + result.device() == input.device(), + fn_name, + ": Expected ", result_name, " and input tensors to be on the same device, but got ", + result_name, " on ", result.device(), " and input on ", input.device()); +} + +// Check the dtype of result and input tensors (for _out variants). +// Most linear algebra functions have the same dtype for input and output +// (either floating or complex type input), so we can check whether input's dtype can be casted to result's dtype. +// According to https://github.com/pytorch/pytorch/wiki/Developer-FAQ#how-does-out-work-in-pytorch +// c10::canCast is used for checking the "safe copy" dtype requirements. +inline void checkLinalgCompatibleDtype(const std::string& fn_name, Tensor result, Tensor input, const std::string& result_name = "result") { + bool can_cast = c10::canCast(input.scalar_type(), result.scalar_type()); + TORCH_CHECK( + can_cast, + fn_name, + ": Expected ", result_name, " to be safely castable from ", input.scalar_type(), " dtype, but got ", + result_name, " with dtype ", result.scalar_type()); +} + +// Alternatively, we can check whether the specific expected output type (result_type) can be safely casted to out tensor dtype (out_type) +inline void checkLinalgCompatibleDtype(const std::string& fn_name, ScalarType out_type, ScalarType result_type, const std::string& out_name = "result") { + bool can_cast = c10::canCast(result_type, out_type); + TORCH_CHECK( + can_cast, + fn_name, + ": Expected ", out_name, " to be safely castable from ", result_type, " dtype, but got ", + out_name, " with dtype ", out_type); +} + +inline void checkNotComplexTolerance(const Tensor& tol, const std::string_view f_name, const std::string_view tol_name) { + TORCH_CHECK(!at::isComplexType(tol.scalar_type()), + f_name, ": ", tol_name, " tensor of complex type is not supported. Got ", tol.scalar_type()); +} + +/* + Two types of 'other' tensors are supported when solving + a system of linear equations matmul(input, x) = other: + * 1-dimensional (1D) tensor or batch of 1D tensors (vector case) + * 2-dimensional (2D) tensor or batch of 2D tensors (matrix case). + The original torch.solve supported only the matrix case, while NumPy works for both cases. + For the batched input we need to be able to distinguish them. + Let input.shape = (batch_dimensions, m, n), then 'other' is of vector type if other.shape == (batch_dimensions, m). + This rule is compatible with NumPy, see https://github.com/numpy/numpy/blob/v1.20.0/numpy/linalg/linalg.py#L384-L389 +*/ +inline bool linalg_solve_is_vector_rhs(const Tensor& input, const Tensor& other) { + auto expected_batched_rhs_shape = SymIntArrayRef(input.sym_sizes().data(), input.dim() - 1); // input.shape[:-1] + bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sym_sizes().equals(expected_batched_rhs_shape)); + return vector_case; +} + +/* + Computes linear indices for a tensor with original_shape to access its elements like it was a materialized broadcast tensor. +*/ +inline Tensor get_linear_indices(int64_t numel, IntArrayRef original_shape, IntArrayRef broadcast_shape) { + TensorOptions options = at::TensorOptions().dtype(at::kLong).device(at::kCPU); + return at::arange(numel, options).view(original_shape).broadcast_to(broadcast_shape).contiguous(); +} + +class BroadcastLinearIndices { + private: + Tensor linear_indices_; + bool is_broadcasting_; + + public: + BroadcastLinearIndices( + int64_t numel, + IntArrayRef original_shape, + IntArrayRef broadcast_shape) : is_broadcasting_(!original_shape.equals(broadcast_shape)) { + // The assumption is that the broadcast_shape is a materialized broadcast + // shape of the original_shape. We need to compute the linear indices + // compatible with the original_shape to access the elements in the original + // tensor corresponding to the broadcast tensor. + if (is_broadcasting_) { + linear_indices_ = + get_linear_indices(numel, original_shape, broadcast_shape); + } + } + int64_t operator()(int64_t broadcast_linear_index) { + return is_broadcasting_ + ? linear_indices_.data_ptr()[broadcast_linear_index] + : broadcast_linear_index; + } +}; + +inline bool is_blas_compatible_column_major_order(const Tensor& input) { + IntArrayRef input_strides = input.strides(); + IntArrayRef input_sizes = input.sizes(); + auto ndim = input.dim(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); + if (ndim > 3) { + return input.transpose(-2, -1).is_contiguous(); + } + auto leading_dimension = input_strides[ndim - 1]; + auto rows = input_sizes[ndim - 2]; + bool batch_stride_compatible = true; + if (ndim == 3) { + auto cols = input_sizes[ndim - 1]; + batch_stride_compatible = + input_strides[ndim - 3] >= leading_dimension * cols; + } + return (input_strides[ndim - 2] == 1) && + (leading_dimension >= std::max(1, rows)) && + batch_stride_compatible; +} + +inline bool is_blas_compatible_row_major_order(const Tensor& input) { + IntArrayRef input_strides = input.strides(); + IntArrayRef input_sizes = input.sizes(); + auto ndim = input.dim(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); + if (ndim > 3) { + return input.is_contiguous(); + } + auto leading_dimension = input_strides[ndim - 2]; + auto cols = input_sizes[ndim - 1]; + bool batch_stride_compatible = true; + if (ndim == 3) { + auto rows = input_sizes[ndim - 2]; + batch_stride_compatible = + input_strides[ndim - 3] >= leading_dimension * rows; + } + return (input_strides[ndim - 1] == 1) && + (leading_dimension >= std::max(1, cols)) && + batch_stride_compatible; +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/LossMulti.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/LossMulti.h new file mode 100644 index 0000000000000000000000000000000000000000..2a8b87e937a671c5fe04aea722502eda558f773b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/LossMulti.h @@ -0,0 +1,74 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +namespace at::native { + inline void multilabel_margin_loss_shape_check( + int64_t& nframe, + int64_t& dim, + const int64_t& ndims, + const Tensor& input, + const Tensor& target) { + TORCH_CHECK( + (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0, + "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", + input.sizes()); + + if (ndims <= 1) { + nframe = 1; + dim = ndims == 0 ? 1 : input.size(0); + TORCH_CHECK( + target.dim() <= 1 && target.numel() == dim, + "inconsistent target size: ", target.sizes(), " for input of size: ", + input.sizes()); + } else { + nframe = input.size(0); + dim = input.size(1); + TORCH_CHECK( + target.dim() == 2 && target.size(0) == nframe && + target.size(1) == dim, + "inconsistent target size: ", target.sizes(), " for input of size: ", + input.sizes()); + } + } + + inline void multi_margin_loss_shape_check( + int64_t& nframe, + int64_t& dim, + const int64_t& ndims, + const Tensor& input, + const Tensor& target, + const std::optional& weight) { + TORCH_CHECK( + (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0, + "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", + input.sizes()); + + if (ndims <= 1) { + nframe = 1; + dim = ndims == 0 ? 1 : input.size(0); + } else { + nframe = input.size(0); + dim = input.size(1); + } + + TORCH_CHECK( + target.dim() <= 1 && target.numel() == nframe, + "inconsistent target size, expected ", nframe, " but got ", + target.sizes()); + if (weight && weight->defined()) { + TORCH_CHECK( + weight->dim() <= 1 && weight->numel() == dim, + "inconsistent weight size, expected ", dim, " but got ", + weight->sizes()); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Math.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Math.h new file mode 100644 index 0000000000000000000000000000000000000000..fed8e3e14cbff16108cce5c98e07ba4e276d8066 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Math.h @@ -0,0 +1,3932 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") +#endif + +/* The next function is taken from https://github.com/antelopeusersgroup/antelope_contrib/blob/master/lib/location/libgenloc/erfinv.c. +Below is the copyright. +Output was modified to be inf or -inf when input is 1 or -1. */ + + +/* + Copyright (c) 2014 Indiana University + All rights reserved. + + Written by Prof. Gary L. Pavlis, Dept. of Geol. Sci., + Indiana University, Bloomington, IN + + This software is licensed under the New BSD license: + + Redistribution and use in source and binary forms, + with or without modification, are permitted provided + that the following conditions are met: + + Redistributions of source code must retain the above + copyright notice, this list of conditions and the + following disclaimer. + + Redistributions in binary form must reproduce the + above copyright notice, this list of conditions and + the following disclaimer in the documentation and/or + other materials provided with the distribution. + + Neither the name of Indiana University nor + the names of its contributors may be used to endorse + or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. +*/ + +namespace { +/* + * This function is derived from the implementation of the i0e function in the + * Cephes Math Library. See note [3-Clause BSD License for the Cephes Math + * Library]. + * + * Computes an approximation of the exponentially scaled zeroth order modified + * Bessel function of the first kind. The approximation is actually two + * (sub)approximations, both using a Chebyshev polynomial expansion. One + * approximates the function over [0, 8], and the other over (8, infinity). This + * function takes the absolute value of all inputs to convert them into the + * domain of the approximation. + */ +jiterator_also_stringify_as(jiterator_code( + template + JITERATOR_HOST_DEVICE T chbevl(T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + JITERATOR_HOST_DEVICE T calc_i0e(T _x) { + T x = std::fabs(_x); + + if (x <= T{8.0}) { + static const T coefficients[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + T y = (x / T{2.0}) - T{2.0}; + return chbevl(y, coefficients, int{30}); + } + + // x > 8 + static const T coefficients[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return chbevl(T{32.0} / x - T{2.0}, coefficients, int{25}) / std::sqrt(x); + }), + i0e_string) // i0e_string +} + +#define CENTRAL_RANGE 0.7 + +template +inline typename std::enable_if_t, T> +calc_erfinv(T y) { +/* Function to calculate inverse error function. Rational approximation +is used to generate an initial approximation, which is then improved to +full accuracy by two steps of Newton's method. Code is a direct +translation of the erfinv m file in matlab version 2.0. +Author: Gary L. Pavlis, Indiana University +Date: February 1996 +*/ + T x, z, num, dem; /*working variables */ + /* coefficients in rational expansion */ + T a[4] = { T(0.886226899), T(-1.645349621), T(0.914624893), T(-0.140543331) }; + T b[4] = { T(-2.118377725), T(1.442710462), T(-0.329097515), T(0.012229801) }; + T c[4] = { T(-1.970840454), T(-1.624906493), T(3.429567803), T(1.641345311) }; + T d[2] = { T(3.543889200), T(1.637067800) }; + T y_abs = std::abs(y); + if(y_abs > 1.0) return std::numeric_limits::quiet_NaN(); +#ifdef _WIN32 + // error C2039: '_copysign': is not a member of 'std' + if(y_abs == 1.0) return copysign(std::numeric_limits::infinity(), y); +#else + if(y_abs == 1.0) return std::copysign(std::numeric_limits::infinity(), y); +#endif + if(y_abs <= static_cast(CENTRAL_RANGE)) { + z = y * y; + num = (((a[3]*z + a[2])*z + a[1])*z + a[0]); + dem = ((((b[3]*z + b[2])*z + b[1])*z +b[0]) * z + static_cast(1.0)); + x = y * num / dem; + } + else{ + z = std::sqrt(-std::log((static_cast(1.0)-y_abs)/static_cast(2.0))); + num = ((c[3]*z + c[2])*z + c[1]) * z + c[0]; + dem = (d[1]*z + d[0])*z + static_cast(1.0); +#ifdef _WIN32 + // error C2039: '_copysign': is not a member of 'std' + x = copysign(num, y) / dem; +#else + x = std::copysign(num, y) / dem; +#endif + } + /* Two steps of Newton-Raphson correction */ + x = x - (std::erf(x) - y) / ((static_cast(2.0)/static_cast(std::sqrt(c10::pi)))*std::exp(-x*x)); + x = x - (std::erf(x) - y) / ((static_cast(2.0)/static_cast(std::sqrt(c10::pi)))*std::exp(-x*x)); + + return x; +} + +#undef CENTRAL_RANGE + +/* + * Note [3-Clause BSD License for the Cephes Math Library] + * Code derived from implementations in the Cephes Math Library should mention its derivation and reference + * this note (ex. 'This function is derived from the implementation of X in the Cephes Math Library. See note + * [3-Clause BSD License for the Cephes Math Library]. The license is: + * Copyright (c) 2018, Steven Moshier + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Steven Moshier BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This function is derived from the implementation of the zeta function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + */ +template +C10_HOST_DEVICE inline scalar_t zeta(scalar_t x, scalar_t q) __ubsan_ignore_float_divide_by_zero__ { + using acc_t = at::acc_type; + const acc_t MACHEP = acc_t{1.11022302462515654042E-16}; + constexpr acc_t zero = acc_t{0.0}; + constexpr acc_t half = acc_t{0.5}; + constexpr acc_t one = acc_t{1.0}; + static const acc_t A[] = { + 12.0, + -720.0, + 30240.0, + -1209600.0, + 47900160.0, + -1.8924375803183791606e9, /*1.307674368e12/691*/ + 7.47242496e10, + -2.950130727918164224e12, /*1.067062284288e16/3617*/ + 1.1646782814350067249e14, /*5.109094217170944e18/43867*/ + -4.5979787224074726105e15, /*8.028576626982912e20/174611*/ + 1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/ + -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/ + }; + + acc_t a, b, k, s, t, w; + if (x == one) { + return std::numeric_limits::infinity(); + } + + if (x < one) { + return std::numeric_limits::quiet_NaN(); + } + + if (q <= zero) { + if (q == std::floor(q)) { + return std::numeric_limits::infinity(); + } + if (x != std::floor(x)) { + return std::numeric_limits::quiet_NaN(); + } + } + + s = std::pow(q, -x); + a = q; + int i = 0; + b = zero; + while ((i < 9) || (a <= acc_t{9.0})) { + i += 1; + a += one; + b = ::pow(a, -x); + s += b; + if ((-MACHEP * s < b) && (b < MACHEP * s)) { + return static_cast(s); + } + }; + + w = a; + s += b * w / (x - one); + s -= half * b; + a = one; + k = zero; + for (i = 0; i < 12; i++) { + a *= x + k; + b /= w; + t = a * b / A[i]; + s = s + t; + t = ::fabs(t / s); + if (t < MACHEP) { + return static_cast(s); + } + k += one; + a *= x + k; + b /= w; + k += one; + } + return static_cast(s); +} + +/* + * This function is derived from the implementation of the digamma function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + */ +template +C10_HOST_DEVICE inline T polevl(const T x, const T A[], size_t len) { + T result = 0; + for (size_t i = 0; i <= len; i++) { + result = result * x + A[i]; + } + return result; +} + +inline double trigamma(double x) __ubsan_ignore_float_divide_by_zero__ { + double sign = +1; + double result = 0; + if (x < 0.5) { + sign = -1; + const double sin_pi_x = sin(c10::pi * x); + result -= (c10::pi * c10::pi) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const double ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (1./6 - ixx * (1./30 - ixx * (1./42)))) / x; + return sign * result; +} + +inline float trigamma(float x) __ubsan_ignore_float_divide_by_zero__ { + float sign = +1; + float result = 0; + if (x < 0.5f) { + sign = -1; + const float sin_pi_x = sinf(c10::pi * x); + result -= (c10::pi * c10::pi) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const float ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (1.f/6 - ixx * (1.f/30 - ixx * (1.f/42)))) / x; + return sign * result; +} + +/* + * This function is derived from the implementation of the digamma function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + */ +inline double calc_digamma(double x) { + // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma + static double PSI_10 = 2.25175258906672110764; + if (x == 0) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is ±0, ±∞ is returned + return std::copysign(INFINITY, -x); + } + + bool x_is_integer = x == trunc(x); + if (x < 0) { + if (x_is_integer) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is a negative integer, NaN is returned + return std::numeric_limits::quiet_NaN(); + } + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). + double q, r; + r = std::modf(x, &q); + return calc_digamma(1 - x) - c10::pi / tan(c10::pi * r); + } + + // Push x to be >= 10 + double result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return result + PSI_10; + } + + // Compute asymptotic digamma + static const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + double y = 0; + if (x < 1.0e17) { + double z = 1.0 / (x * x); + y = z * polevl(z, A, 6); + } + return result + log(x) - (0.5 / x) - y; +} + +/* + * This function is derived from the implementation of the digamma function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + */ +inline float calc_digamma(float x) { + // See [C++ Standard Reference: Gamma Function] + static float PSI_10 = 2.25175258906672110764f; + if (x == 0) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is ±0, ±∞ is returned + return std::copysign(INFINITY, -x); + } + + bool x_is_integer = x == truncf(x); + if (x < 0) { + if (x_is_integer) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is a negative integer, NaN is returned + return std::numeric_limits::quiet_NaN(); + } + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). + double q, r; + r = std::modf(x, &q); + float pi_over_tan_pi_x = (float)(c10::pi / tan(c10::pi * r)); + return calc_digamma(1 - x) - pi_over_tan_pi_x; + } + + // Push x to be >= 10 + float result = 0; + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return result + PSI_10; + } + + // Compute asymptotic digamma + static const float A[] = { + 8.33333333333333333333E-2f, + -2.10927960927960927961E-2f, + 7.57575757575757575758E-3f, + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f, + }; + + float y = 0; + if (x < 1.0e17f) { + float z = 1 / (x * x); + y = z * polevl(z, A, 6); + } + return result + logf(x) - (0.5f / x) - y; +} + +inline c10::BFloat16 calc_digamma(c10::BFloat16 a) { + return calc_digamma(static_cast(a)); +} + +inline c10::Half calc_digamma(c10::Half a) { + return calc_digamma(static_cast(a)); +} + +template +inline C10_HOST_DEVICE scalar_t calc_polygamma(scalar_t x, int n) { + // already blocked if n <= 1 + const auto one = scalar_t{1}; + return ((n % 2) ? one : -one) * + std::exp(std::lgamma(static_cast(n) + one)) * + zeta(static_cast(n + 1), x); +} + +// regularized lower incomplete gamma +// the regularized lower, upper incomplete gamma, as well as their +// helper functions follow SciPy's implementation + +/* References + * [igam1] "The Digital Library of Mathematical Functions", dlmf.nist.gov + * [igam2] Maddock et al., "Incomplete Gamma Functions", + * https://www.boost.org/doc/libs/1_61_0/libs/math/doc/html/math_toolkit/sf_gamma/igamma.html + */ + +/* + * This implementation of the regularized incomplete gamma functions and + * their helper functions are derived from the implementation of SciPy's + * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations. + * See NOTICE for the licenses. + */ +template +scalar_t ratevl(scalar_t x, const scalar_t num[], int64_t M, + const scalar_t denom[], int64_t N) { + // evaluating rational function, i.e., the ratio of two polynomials + // the coefficients for numerator are given by `num` while coeffs for + // denumerator are given by `denom` + + int64_t i, dir; + scalar_t y, num_ans, denom_ans; + scalar_t absx = std::fabs(x); + const scalar_t *p; + + if (absx > 1) { + /* Evaluate as a polynomial in 1/x. */ + dir = -1; + p = num + M; + y = 1 / x; + } + else { + dir = 1; + p = num; + y = x; + } + + /* Evaluate the numerator */ + num_ans = *p; + p += dir; + for (i = 1; i <= M; i++) { + num_ans = num_ans * y + *p; + p += dir; + } + /* Evaluate the denominator */ + if (absx > 1) { + p = denom + N; + } + else { + p = denom; + } + + denom_ans = *p; + p += dir; + for (i = 1; i <= N; i++) { + denom_ans = denom_ans * y + *p; + p += dir; + } + if (absx > 1) { + i = N - M; + return std::pow(x, i) * num_ans / denom_ans; + } + else { + return num_ans / denom_ans; + } +} + +// SciPy's lanczos implementation is taken from Boost +/* (C) Copyright John Maddock 2006. + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. See + * https://www.boost.org/LICENSE_1_0.txt or see NOTICE. + */ +template +static scalar_t lanczos_sum_expg_scaled(scalar_t x) { + // lanczos approximation + static constexpr scalar_t lanczos_sum_expg_scaled_num[13] = { + 0.006061842346248906525783753964555936883222, + 0.5098416655656676188125178644804694509993, + 19.51992788247617482847860966235652136208, + 449.9445569063168119446858607650988409623, + 6955.999602515376140356310115515198987526, + 75999.29304014542649875303443598909137092, + 601859.6171681098786670226533699352302507, + 3481712.15498064590882071018964774556468, + 14605578.08768506808414169982791359218571, + 43338889.32467613834773723740590533316085, + 86363131.28813859145546927288977868422342, + 103794043.1163445451906271053616070238554, + 56906521.91347156388090791033559122686859 + }; + static constexpr scalar_t lanczos_sum_expg_scaled_denom[13] = { + 1., + 66., + 1925., + 32670., + 357423., + 2637558., + 13339535., + 45995730., + 105258076., + 150917976., + 120543840., + 39916800., + 0. + }; + return ratevl(x, lanczos_sum_expg_scaled_num, + sizeof(lanczos_sum_expg_scaled_num) / sizeof(lanczos_sum_expg_scaled_num[0]) - 1, + lanczos_sum_expg_scaled_denom, + sizeof(lanczos_sum_expg_scaled_denom) / sizeof(lanczos_sum_expg_scaled_denom[0]) - 1); +} + +template +static scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { + // compute x^a * exp(-a) / gamma(a) + // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with + // exp(a - x). + + scalar_t ax, fac, res, num, numfac; + static scalar_t MAXLOG = std::is_same_v ? + 7.09782712893383996843E2 : 88.72283905206835; + static scalar_t EXP1 = 2.718281828459045; + static scalar_t lanczos_g = 6.024680040776729583740234375; + + if (std::fabs(a - x) > 0.4 * std::fabs(a)) { + ax = a * std::log(x) - x - std::lgamma(a); + if (ax < -MAXLOG) { + return 0.0; + } + return std::exp(ax); + } + + fac = a + lanczos_g - 0.5; + res = std::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a); + + if ((a < 200) && (x < 200)) { + res *= std::exp(a - x) * std::pow(x / fac, a); + } + else { + num = x - a - lanczos_g + 0.5; + numfac = num / fac; + res *= std::exp(a * (std::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac); + } + return res; +} + +template +static scalar_t _igam_helper_series(scalar_t a, scalar_t x) { + // Compute igam using DLMF 8.11.4. [igam1] + static scalar_t MACHEP = std::is_same_v ? + 1.11022302462515654042E-16 : 5.9604644775390625E-8; + static int MAXITER = 2000; + + int i; + scalar_t ans, ax, c, r; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* power series */ + r = a; + c = 1.0; + ans = 1.0; + + for (i = 0; i < MAXITER; i++) { + r += 1.0; + c *= x / r; + ans += c; + if (c <= MACHEP * ans) { + break; + } + } + return (ans * ax / a); +} + +template +static scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in + // _igam_helper_series but extra care is taken to avoid cancellation. + + int n; + scalar_t fac = 1; + scalar_t sum = 0; + scalar_t term, logx; + static scalar_t MAXITER = 2000; + static scalar_t MACHEP = std::is_same_v ? + 1.11022302462515654042E-16 : 5.9604644775390625E-8; + + for (n = 1; n < MAXITER; n++) { + fac *= -x / n; + term = fac / (a + n); + sum += term; + if (std::fabs(term) <= MACHEP * std::fabs(sum)) { + break; + } + } + + logx = std::log(x); + term = -std::expm1(a * logx - std::lgamma(1+a)); + return term - std::exp(a * logx - std::lgamma(a)) * sum; +} + +template +static scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { + // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] + static constexpr scalar_t d[25][25] = + {{-3.3333333333333333e-1, 8.3333333333333333e-2, -1.4814814814814815e-2, + 1.1574074074074074e-3, 3.527336860670194e-4, -1.7875514403292181e-4, + 3.9192631785224378e-5, -2.1854485106799922e-6, -1.85406221071516e-6, + 8.296711340953086e-7, -1.7665952736826079e-7, 6.7078535434014986e-9, + 1.0261809784240308e-8, -4.3820360184533532e-9, 9.1476995822367902e-10, + -2.551419399494625e-11, -5.8307721325504251e-11, 2.4361948020667416e-11, + -5.0276692801141756e-12, 1.1004392031956135e-13, 3.3717632624009854e-13, + -1.3923887224181621e-13, 2.8534893807047443e-14, -5.1391118342425726e-16, + -1.9752288294349443e-15}, + {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, + -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, + -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, + 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, + 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, + 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, + 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, + -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, + -4.13125571381061e-15}, + {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, + 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, + -1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, + -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, + -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, + 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, + 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, + 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, + 8.8592218725911273e-15}, + {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4, + 2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7, + 1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6, + -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8, + -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9, + -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14, + -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12, + 6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14, + 2.0453671226782849e-14}, + {-8.618882909167117e-4, 7.8403922172006663e-4, -2.9907248030319018e-4, + -1.4638452578843418e-6, 6.6414982154651222e-5, -3.9683650471794347e-5, + 1.1375726970678419e-5, 2.5074972262375328e-10, -1.6954149536558306e-6, + 8.9075075322053097e-7, -2.2929348340008049e-7, 2.956794137544049e-11, + 2.8865829742708784e-8, -1.4189739437803219e-8, 3.4463580499464897e-9, + -2.3024517174528067e-13, -3.9409233028046405e-10, 1.8602338968504502e-10, + -4.356323005056618e-11, 1.2786001016296231e-15, 4.6792750266579195e-12, + -2.1492464706134829e-12, 4.9088156148096522e-13, -6.3385914848915603e-18, + -5.0453320690800944e-14}, + {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4, + -1.9932570516188848e-4, 6.7977804779372078e-5, 1.419062920643967e-7, + -1.3594048189768693e-5, 8.0184702563342015e-6, -2.2914811765080952e-6, + -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7, + 4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9, + 3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15, + 9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11, + -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13, + -1.3249659916340829e-13}, + {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4, + 7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5, + -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6, + -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13, + -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8, + 8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10, + 2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11, + 1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18, + 3.6902800842763467e-13}, + {3.4436760689237767e-4, 5.1717909082605922e-5, -3.3493161081142236e-4, + 2.812695154763237e-4, -1.0976582244684731e-4, -1.2741009095484485e-7, + 2.7744451511563644e-5, -1.8263488805711333e-5, 5.7876949497350524e-6, + 4.9387589339362704e-10, -1.0595367014026043e-6, 6.1667143761104075e-7, + -1.7562973359060462e-7, -1.2974473287015439e-12, 2.695423606288966e-8, + -1.4578352908731271e-8, 3.887645959386175e-9, -3.8810022510194121e-17, + -5.3279941738772867e-10, 2.7437977643314845e-10, -6.9957960920705679e-11, + 2.5899863874868481e-17, 8.8566890996696381e-12, -4.403168815871311e-12, + 1.0865561947091654e-12}, + {-6.5262391859530942e-4, 8.3949872067208728e-4, -4.3829709854172101e-4, + -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4, + 4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5, + 6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11, + 3.7735877416110979e-7, -2.1867506700122867e-7, 6.2202288040189269e-8, + 6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9, + -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10, + -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18, + -3.3721464474854592e-12}, + {-5.9676129019274625e-4, -7.2048954160200106e-5, 6.7823088376673284e-4, + -6.4014752602627585e-4, 2.7750107634328704e-4, 1.8197008380465151e-7, + -8.4795071170685032e-5, 6.105192082501531e-5, -2.1073920183404862e-5, + -8.8585890141255994e-10, 4.5284535953805377e-6, -2.8427815022504408e-6, + 8.7082341778646412e-7, 3.6886101871706965e-12, -1.5344695190702061e-7, + 8.862466778790695e-8, -2.5184812301826817e-8, -1.0225912098215092e-14, + 3.8969470758154777e-9, -2.1267304792235635e-9, 5.7370135528051385e-10, + -1.887749850169741e-19, -8.0931538694657866e-11, 4.2382723283449199e-11, + -1.1002224534207726e-11}, + {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3, + 9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4, + -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5, + -3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11, + -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7, + -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8, + 1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9, + 9.7480262548731646e-10, -2.6405338676507616e-10, 5.794875163403742e-18, + 3.7647749553543836e-11}, + {1.579727660730835e-3, 1.6251626278391582e-4, -2.0633421035543276e-3, + 2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7, + 3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4, + 2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5, + -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6, + -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14, + -3.5609886164949055e-8, 2.0470855345905963e-8, -5.8091738633283358e-9, + -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10, + 1.3481607129399749e-10}, + {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3, + -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3, + 8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4, + 1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10, + 1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6, + 7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7, + -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8, + -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20, + -5.0423112718105824e-10}, + {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3, + -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6, + -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4, + -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4, + 4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5, + 6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13, + 3.9756029041993247e-7, -2.3956211978815887e-7, 7.1182883382145864e-8, + 8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9, + -1.9661464453856102e-9}, + {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2, + 7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2, + -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3, + -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10, + -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5, + -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6, + 1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7, + 1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17, + 7.9795091026746235e-9}, + {3.0249124160905891e-2, 2.4817436002649977e-3, -4.9939134373457022e-2, + 5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6, + 1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3, + 3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3, + -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4, + -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12, + -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6, + -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7, + 3.3654425209171788e-8}, + {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1, + -3.1478872752284357e-5, 9.0510635276848131e-2, -9.2828824411184397e-2, + 4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2, + 1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9, + 1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4, + 1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5, + -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6, + -2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16, + -1.4729737374018841e-7}, + {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1, + -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5, + -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2, + -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2, + 5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3, + 1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12, + 8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5, + 3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6, + -6.6812849447625594e-7}, + {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968, + 1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1, + -4.4971003995291339e-1, -1.6107401567546652e-6, 1.9235590165271091e-1, + -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8, + -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3, + -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3, + 3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5, + 5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14, + 3.1369106244517615e-6}, + {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906, + 4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4, + 1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1, + 1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1, + -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2, + -1.7813678334552311e-2, 6.3970330388900056e-3, 4.9430807090480523e-11, + -1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4, + 9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5, + 1.5227271505597605e-5}, + {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1, + -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1, + 5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816, + 2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7, + 3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1, + 8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2, + -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3, + -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11, + -7.6340103696869031e-5}, + {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1, + -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3, + -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1, + -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195, + 1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1, + 3.4112666080644461e-1, -1.2699786326594923e-1, -2.8953677269081528e-10, + 3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3, + -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3, + -3.9479941246822517e-4}, + {7.389033153567425e+1, -1.5680141270402273e+2, 1.322177542759164e+2, + 1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2, + -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1, + -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7, + -6.2716159907747034, 5.1168999071852637, -2.0319658112299095, + -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1, + 1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2, + 2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6, + 2.1250180774699461e-3}, + {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2, + 7.3121595266969204e+2, -4.8213821720890847e+2, -2.8817248692894889e-2, + 3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2, + 1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1, + -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373, + -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7, + -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1, + 1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2, + 1.5109265210467774e-2}, + {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3, + -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3, + 1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2, + 7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6, + 1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1, + -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1, + -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468, + -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1, + 4.8683443692930507e-1}}; + + int k, n, sgn; + int maxpow = 0; + static scalar_t MACHEP = std::is_same_v ? + 1.11022302462515654042E-16 : 5.9604644775390625E-8; + scalar_t lambda = x / a; + scalar_t sigma = (x - a) / a; + scalar_t eta, res, ck, ckterm, term, absterm; + scalar_t absoldterm = INFINITY; + scalar_t etapow[25] = {1}; + scalar_t sum = 0; + scalar_t afac = 1; + + if (igam) { + sgn = -1; + } + else { + sgn = 1; + } + + if (lambda > 1) { + eta = std::sqrt(-2 * (std::log1p(sigma) - sigma)); + } + else if (lambda < 1) { + eta = -std::sqrt(-2 * (std::log1p(sigma) - sigma)); + } + else { + eta = 0; + } + res = 0.5 * std::erfc(sgn * eta * std::sqrt(a / 2)); + + for (k = 0; k < 25; k++) { + ck = d[k][0]; + for (n = 1; n < 25; n++) { + if (n > maxpow) { + etapow[n] = eta * etapow[n-1]; + maxpow += 1; + } + ckterm = d[k][n]*etapow[n]; + ck += ckterm; + if (std::fabs(ckterm) < MACHEP * std::fabs(ck)) { + break; + } + } + term = ck * afac; + absterm = std::fabs(term); + if (absterm > absoldterm) { + break; + } + sum += term; + if (absterm < MACHEP * std::fabs(sum)) { + break; + } + absoldterm = absterm; + afac /= a; + } + res += sgn * std::exp(-0.5 * a * eta * eta) * sum / std::sqrt(2 * c10::pi * a); + + return res; +} + +template +static scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.9.2. [igam1] + int i; + scalar_t ans, ax, c, yc, r, t, y, z; + scalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; + int MAXITER = 2000; + static scalar_t MACHEP = std::is_same_v ? + 1.11022302462515654042E-16 : 5.9604644775390625E-8; + static scalar_t BIG = std::is_same_v ? + 4.503599627370496e15 : 16777216.; + static scalar_t BIGINV = std::is_same_v ? + 2.22044604925031308085e-16 : 5.9604644775390625E-8; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* continued fraction */ + y = 1.0 - a; + z = x + y + 1.0; + c = 0.0; + pkm2 = 1.0; + qkm2 = x; + pkm1 = x + 1.0; + qkm1 = z * x; + ans = pkm1 / qkm1; + + for (i = 0; i < MAXITER; i++) { + c += 1.0; + y += 1.0; + z += 2.0; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != 0) { + r = pk / qk; + t = std::fabs((ans - r) / r); + ans = r; + } + else { + t = 1.0; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (std::fabs(pk) > BIG) { + pkm2 *= BIGINV; + pkm1 *= BIGINV; + qkm2 *= BIGINV; + qkm1 *= BIGINV; + } + if (t <= MACHEP) { + break; + } + } + return ans * ax; +} + +template +inline scalar_t calc_igammac(scalar_t a, scalar_t x) { + /* the calculation of the regularized upper incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.4 [igam1]) + * - if x > 1.1 and x < a, using the subtraction from the regularized lower + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (5) + */ + scalar_t absxma_a; + + static scalar_t SMALL = 20.0; + static scalar_t LARGE = 200.0; + static scalar_t SMALLRATIO = 0.3; + static scalar_t LARGERATIO = 4.5; + + // note that in SciPy, a and x are non-negative, with exclusive 0s (i.e., + // at most 1 of them can be 0), where igammac(0, x) = 0.0 iff x > 0. + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return std::numeric_limits::quiet_NaN(); + } + else if (a == 0) { + if (x > 0) { + return 0.0; + } + else { + return std::numeric_limits::quiet_NaN(); + } + } + else if (x == 0) { + return 1.0; + } + else if (std::isinf(a)) { + if (std::isinf(x)) { + return std::numeric_limits::quiet_NaN(); + } + return 1.0; + } + else if (std::isinf(x)) { + return 0.0; + } + + absxma_a = std::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 0); + } + else if ((a > LARGE) && (absxma_a < LARGERATIO / std::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 0); + } + + if (x > 1.1) { + if (x < a) { + return 1.0 - _igam_helper_series(a, x); + } + else { + return _igamc_helper_continued_fraction(a, x); + } + } + else if (x <= 0.5) { + if (-0.4 / std::log(x) < a) { + return 1.0 - _igam_helper_series(a, x); + } + else { + return _igamc_helper_series(a, x); + } + } + else { + if (x * 1.1 < a) { + return 1.0 - _igam_helper_series(a, x); + } + else { + return _igamc_helper_series(a, x); + } + } +} + +template +scalar_t calc_igamma(scalar_t a, scalar_t x) { + /* the calculation of the regularized lower incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.3 [igam1]) + * - if x > 1 and x > a, using the subtraction from the regularized upper + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (4) + */ + scalar_t absxma_a; + static scalar_t SMALL = 20.0; + static scalar_t LARGE = 200.0; + static scalar_t SMALLRATIO = 0.3; + static scalar_t LARGERATIO = 4.5; + + // boundary values following SciPy + // note that in SciPy, a and x are non-negative, with exclusive 0s (i.e., + // at most 1 of them can be 0), where igamma(0, x) = 1.0 iff x > 0. + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return std::numeric_limits::quiet_NaN(); + } + else if (a == 0) { + if (x > 0) { + return 1.0; + } + else { + return std::numeric_limits::quiet_NaN(); + } + } + else if (x == 0) { + return 0.0; // zero integration limit + } + else if (std::isinf(a)) { + if (std::isinf(x)) { + return std::numeric_limits::quiet_NaN(); + } + return 0.0; + } + else if (std::isinf(x)) { + return 1.0; + } + + /* Asymptotic regime where a ~ x. See [igam2] */ + absxma_a = std::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 1); + } + else if ((a > LARGE) && (absxma_a < LARGERATIO / std::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 1); + } + + if ((x > 1.0) && (x > a)) { + return 1.0 - calc_igammac(a, x); + } + + return _igam_helper_series(a, x); +} + +template <> +[[maybe_unused]] inline c10::BFloat16 calc_igamma( + c10::BFloat16 a, + c10::BFloat16 x) { + return calc_igamma(float(a), float(x)); +} + +template <> +[[maybe_unused]] inline c10::Half calc_igamma( + c10::Half a, + c10::Half x) { + return calc_igamma(float(a), float(x)); +} + +template <> +[[maybe_unused]] inline c10::BFloat16 calc_igammac( + c10::BFloat16 a, + c10::BFloat16 x) { + return calc_igammac(float(a), float(x)); +} + +template <> +[[maybe_unused]] inline c10::Half calc_igammac( + c10::Half a, + c10::Half x) { + return calc_igammac(float(a), float(x)); +} + +inline c10::BFloat16 calc_erfinv(c10::BFloat16 a) { return calc_erfinv(float(a)); } + +template +inline T abs_impl(T v) { + return std::abs(v); +} + +template <> +[[maybe_unused]] inline uint8_t abs_impl(uint8_t v) { + return v; +} + +template +inline typename std::enable_if_t, T> +calc_gcd(T a, T b) { + a = abs_impl(a); + b = abs_impl(b); + while (a != 0) { + T c = a; + a = b % a; + b = c; + } + return b; +} + +template +C10_HOST_DEVICE T exp2_impl(T x) { + return std::exp2(x); +} + +template +C10_HOST_DEVICE c10::complex exp2_impl(c10::complex x) { + // There is no std::exp2 overload for complex, so instead + // use the identity 2^x = e^(ln(2) * x) + constexpr auto ln2 = c10::ln_2; + return std::exp(ln2 * x); +} + +/* + * This function is derived from the implementation of the chbevl function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Evaluates the series + * + * len-1 + * - ' + * y = > array[i] T (x/2) + * - i + * i=0 + * + * of Chebyshev polynomials Ti at argument x/2. + * + * Coefficients are stored in reverse order, i.e. the zero order term is last in the array. Note len is the number of + * coefficients, not the order. + * + * If coefficients are for the interval a to b, x must have been transformed to x -> 2(2x - b - a)/(b-a) before + * entering the routine. This maps x from (a, b) to (-1, 1), over which the Chebyshev polynomials are defined. + * + * If the coefficients are for the inverted interval, in which (a, b) is mapped to (1/b, 1/a), the transformation + * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity, this becomes x -> 4a/x - 1. + */ +template +inline typename std::enable_if_t, T> +chbevl(const T x, const T array[], size_t len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = static_cast(0.0); + + for (size_t i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return (static_cast(0.5) * (b0 - b2)); +} + +/* + * This function is derived from the implementation of the i0 function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Computes an approximation of the zeroth order modified Bessel function of the first kind. + * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion. + * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value + * of all inputs to convert them into the domain of the approximation. + */ +template +inline std::tuple chebyshev_coefficients_i0e_A() { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T coeff[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + return std::make_tuple(coeff, 30); +} + +template +inline std::tuple chebyshev_coefficients_i0e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). + */ + static const T coeff[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return std::make_tuple(coeff, 25); +} + +template +inline typename std::enable_if_t, std::tuple> +chebyshev_coefficients_i1e_A() { + /* Chebyshev coefficients for exp(-x) I1(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I1(x) / x } = 1/2. + */ + static const T coeff[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + return std::make_tuple(coeff, 29); +} + +template +inline typename std::enable_if_t, std::tuple> +chebyshev_coefficients_i1e_A() { + /* Chebyshev coefficients for exp(-x) I1(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I1(x) / x } = 1/2. + */ + static const T coeff[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + return std::make_tuple(coeff, 17); +} + +template +inline typename std::enable_if_t, std::tuple> +chebyshev_coefficients_i1e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi). + */ + static const T coeff[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + return std::make_tuple(coeff, 25); +} + +template +inline typename std::enable_if_t, std::tuple> +chebyshev_coefficients_i1e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi). + */ + static const T coeff[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + return std::make_tuple(coeff, 7); +} + +template +inline typename std::enable_if_t, T> +calc_i0(T _x) { + T x = std::abs(_x); + + if (x <= T{8.0}) { + auto [A, len] = chebyshev_coefficients_i0e_A(); + T y = (x / T{2.0}) - T{2.0}; + return static_cast(std::exp(x) * chbevl(y, A, len)); + } + auto [B, len] = chebyshev_coefficients_i0e_B(); + return std::exp(x) * chbevl(T{32.0} / x - T{2.0}, B, len) / std::sqrt(x); +} + +// Upcast bfloat16/half input to float for numerical accuracy purposes +inline c10::BFloat16 calc_i0(c10::BFloat16 a) { return calc_i0(static_cast(a)); } +inline c10::Half calc_i0(c10::Half a) { return calc_i0(static_cast(a)); } + +/* + * This function is derived from the implementation of the i1 function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Computes an approximation of the first order modified Bessel function of the first kind. + * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion. + * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value + * of all inputs to convert them into the domain of the approximation. + */ +template +inline typename std::enable_if_t, T> +calc_i1(T _x) { + T x = std::abs(_x); + + if (x <= T{8.0}) { + auto [A, len] = chebyshev_coefficients_i1e_A(); + T y = (x / T{2.0}) - T{2.0}; + const T out = std::exp(x) * x * chbevl(y, A, len); + return (_x < T{0.0}) ? -out : out; + } + auto [B, len] = chebyshev_coefficients_i1e_B(); + const T out = (std::exp(x) * chbevl(T{32.0} / x - T{2.0}, B, len)) / std::sqrt(x); + return (_x < T{0.0}) ? -out : out; +} + +// Upcast bfloat16/half input to float for numerical accuracy purposes +inline c10::BFloat16 calc_i1(c10::BFloat16 a) { return calc_i1(static_cast(a)); } +inline c10::Half calc_i1(c10::Half a) { return calc_i1(static_cast(a)); } + + +/* + * This function is derived from the implementation of the i1e function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Computes an approximation of the exponentially scaled first order modified Bessel function of the first kind. + * The approximation is actually two (sub)approximations, both using a Chebyshev polynomial expansion. + * One approximates the function over [0, 8], and the other over (8, infinity). This function takes the absolute value + * of all inputs to convert them into the domain of the approximation. + */ +template +inline typename std::enable_if_t, T> +calc_i1e(T _x) { + T x = std::abs(_x); + + if (x <= T{8.0}) { + auto [A, len] = chebyshev_coefficients_i1e_A(); + T y = (x / T{2.0}) - T{2.0}; + const T out = chbevl(y, A, len) * x; + return (_x < T{0.0}) ? -out : out; + } + auto [B, len] = chebyshev_coefficients_i1e_B(); + const auto out = chbevl(T{32.0} / x - T{2.0}, B, len) / std::sqrt(x); + return (_x < T{0.0}) ? -out : out; +} + +// Upcast bfloat16/half input to float for numerical accuracy purposes +inline c10::BFloat16 calc_i1e(c10::BFloat16 a) { return calc_i1e(static_cast(a)); } +inline c10::Half calc_i1e(c10::Half a) { return calc_i1e(static_cast(a)); } + + +/* + * This function is derived from the implementation of the i1e function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Computes the argument, x, for which the area under the Gaussian probability density function + * (integrated from minus infinity to x) is equal to y. + */ +template +inline C10_HOST_DEVICE T calc_ndtri(T y0) { + + /* sqrt(2pi) */ + constexpr T s2pi = 2.50662827463100050242E0; + constexpr T one = 1; + constexpr T zero = 0; + + /* approximation for 0 <= |y - 0.5| <= 3/8 */ + static const T P0[5] = { + -5.99633501014107895267E1, + 9.80010754185999661536E1, + -5.66762857469070293439E1, + 1.39312609387279679503E1, + -1.23916583867381258016E0, + }; + + static const T Q0[9] = { + 1.00000000000000000000E0, + 1.95448858338141759834E0, + 4.67627912898881538453E0, + 8.63602421390890590575E1, + -2.25462687854119370527E2, + 2.00260212380060660359E2, + -8.20372256168333339912E1, + 1.59056225126211695515E1, + -1.18331621121330003142E0, + }; + + /* Approximation for interval z = sqrt(-2 log y ) between 2 and 8 + * i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14. + */ + static const T P1[9] = { + 4.05544892305962419923E0, + 3.15251094599893866154E1, + 5.71628192246421288162E1, + 4.40805073893200834700E1, + 1.46849561928858024014E1, + 2.18663306850790267539E0, + -1.40256079171354495875E-1, + -3.50424626827848203418E-2, + -8.57456785154685413611E-4, + }; + + static const T Q1[9] = { + 1.00000000000000000000E0, + 1.57799883256466749731E1, + 4.53907635128879210584E1, + 4.13172038254672030440E1, + 1.50425385692907503408E1, + 2.50464946208309415979E0, + -1.42182922854787788574E-1, + -3.80806407691578277194E-2, + -9.33259480895457427372E-4, + }; + + /* Approximation for interval z = sqrt(-2 log y ) between 8 and 64 + * i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890. + */ + + static const T P2[9] = { + 3.23774891776946035970E0, + 6.91522889068984211695E0, + 3.93881025292474443415E0, + 1.33303460815807542389E0, + 2.01485389549179081538E-1, + 1.23716634817820021358E-2, + 3.01581553508235416007E-4, + 2.65806974686737550832E-6, + 6.23974539184983293730E-9, + }; + + static const T Q2[9] = { + 1.00000000000000000000E0, + 6.02427039364742014255E0, + 3.67983563856160859403E0, + 1.37702099489081330271E0, + 2.16236993594496635890E-1, + 1.34204006088543189037E-2, + 3.28014464682127739104E-4, + 2.89247864745380683936E-6, + 6.79019408009981274425E-9, + }; + + if (y0 == zero) { + return -std::numeric_limits::infinity(); + } + if (y0 == one) { + return std::numeric_limits::infinity(); + } + if (y0 < zero || y0 > one) { + return std::numeric_limits::quiet_NaN(); + } + bool code = true; + T y = y0; + if (y > one - T{0.13533528323661269189}) { /* 0.135... = exp(-2) */ + y = one - y; + code = false; + } + + if (y > T{0.13533528323661269189}) { + y = y - T{0.5}; + const T y2 = y * y; + T x = y + y * (y2 * polevl(y2, P0, 4) / polevl(y2, Q0, 8)); + return (x * s2pi); + } + + T x = ::sqrt(T{-2.0} * ::log(y)); + const T x0 = x - ::log(x) / x; + + const T z = one / x; + T x1; + if (x < T{8.0}) /* y > exp(-32) = 1.2664165549e-14 */ + { + x1 = z * polevl(z, P1, 8) / polevl(z, Q1, 8); + } else { + x1 = z * polevl(z, P2, 8) / polevl(z, Q2, 8); + } + x = x0 - x1; + if (code) { + x = -x; + } + return x; +} + +/* The next function is taken from http://ab-initio.mit.edu/faddeeva */ + +/* Copyright (c) 2012 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by + Steven G. Johnson, October 2012. + + This function combines a few different ideas. + + First, for x > 50, it uses a continued-fraction expansion (same as + for the Faddeeva function, but with algebraic simplifications for z=i*x). + + Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations, + but with two twists: + + a) It maps x to y = 4 / (4+x) in [0,1]. This simple transformation, + inspired by a similar transformation in the octave-forge/specfun + erfcx by Soren Hauberg, results in much faster Chebyshev convergence + than other simple transformations I have examined. + + b) Instead of using a single Chebyshev polynomial for the entire + [0,1] y interval, we break the interval up into 100 equal + subintervals, with a switch/lookup table, and use much lower + degree Chebyshev polynomials in each subinterval. This greatly + improves performance in my tests. + + For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfc(x), + with the usual checks for overflow etcetera. + + Performance-wise, it seems to be substantially faster than either + the SLATEC DERFC function [or an erfcx function derived there from] + or Cody's CALERF function (from netlib.org/specfun), while + retaining near machine precision in accuracy. */ + +/* Given y100=100*y, where y = 4/(4+x) for x >= 0, compute erfc(x). + + Uses a look-up table of 100 different Chebyshev polynomials + for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated + with the help of Maple and a little shell script. This allows + the Chebyshev polynomials to be of significantly lower degree (about 1/4) + compared to fitting the whole [0,1] interval with a single polynomial. */ + + +template +C10_HOST_DEVICE inline typename std::enable_if_t, T> +erfcx_y100(T y100) +{ + switch (static_cast(y100)) { +case 0: { +T t = 2*y100 - 1; +return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t; +} +case 1: { +T t = 2*y100 - 3; +return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t; +} +case 2: { +T t = 2*y100 - 5; +return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t; +} +case 3: { +T t = 2*y100 - 7; +return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t; +} +case 4: { +T t = 2*y100 - 9; +return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t; +} +case 5: { +T t = 2*y100 - 11; +return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t; +} +case 6: { +T t = 2*y100 - 13; +return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t; +} +case 7: { +T t = 2*y100 - 15; +return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t; +} +case 8: { +T t = 2*y100 - 17; +return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t; +} +case 9: { +T t = 2*y100 - 19; +return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t; +} +case 10: { +T t = 2*y100 - 21; +return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t; +} +case 11: { +T t = 2*y100 - 23; +return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t; +} +case 12: { +T t = 2*y100 - 25; +return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t; +} +case 13: { +T t = 2*y100 - 27; +return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t; +} +case 14: { +T t = 2*y100 - 29; +return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t; +} +case 15: { +T t = 2*y100 - 31; +return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t; +} +case 16: { +T t = 2*y100 - 33; +return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t; +} +case 17: { +T t = 2*y100 - 35; +return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t; +} +case 18: { +T t = 2*y100 - 37; +return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t; +} +case 19: { +T t = 2*y100 - 39; +return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t; +} +case 20: { +T t = 2*y100 - 41; +return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t; +} +case 21: { +T t = 2*y100 - 43; +return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t; +} +case 22: { +T t = 2*y100 - 45; +return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t; +} +case 23: { +T t = 2*y100 - 47; +return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t; +} +case 24: { +T t = 2*y100 - 49; +return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t; +} +case 25: { +T t = 2*y100 - 51; +return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t; +} +case 26: { +T t = 2*y100 - 53; +return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t; +} +case 27: { +T t = 2*y100 - 55; +return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t; +} +case 28: { +T t = 2*y100 - 57; +return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t; +} +case 29: { +T t = 2*y100 - 59; +return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t; +} +case 30: { +T t = 2*y100 - 61; +return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t; +} +case 31: { +T t = 2*y100 - 63; +return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t; +} +case 32: { +T t = 2*y100 - 65; +return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t; +} +case 33: { +T t = 2*y100 - 67; +return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t; +} +case 34: { +T t = 2*y100 - 69; +return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t; +} +case 35: { +T t = 2*y100 - 71; +return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t; +} +case 36: { +T t = 2*y100 - 73; +return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t; +} +case 37: { +T t = 2*y100 - 75; +return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t; +} +case 38: { +T t = 2*y100 - 77; +return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t; +} +case 39: { +T t = 2*y100 - 79; +return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t; +} +case 40: { +T t = 2*y100 - 81; +return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t; +} +case 41: { +T t = 2*y100 - 83; +return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t; +} +case 42: { +T t = 2*y100 - 85; +return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t; +} +case 43: { +T t = 2*y100 - 87; +return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t; +} +case 44: { +T t = 2*y100 - 89; +return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t; +} +case 45: { +T t = 2*y100 - 91; +return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t; +} +case 46: { +T t = 2*y100 - 93; +return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t; +} +case 47: { +T t = 2*y100 - 95; +return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t; +} +case 48: { +T t = 2*y100 - 97; +return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t; +} +case 49: { +T t = 2*y100 - 99; +return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t; +} +case 50: { +T t = 2*y100 - 101; +return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t; +} +case 51: { +T t = 2*y100 - 103; +return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t; +} +case 52: { +T t = 2*y100 - 105; +return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t; +} +case 53: { +T t = 2*y100 - 107; +return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t; +} +case 54: { +T t = 2*y100 - 109; +return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t; +} +case 55: { +T t = 2*y100 - 111; +return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t; +} +case 56: { +T t = 2*y100 - 113; +return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t; +} +case 57: { +T t = 2*y100 - 115; +return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t; +} +case 58: { +T t = 2*y100 - 117; +return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t; +} +case 59: { +T t = 2*y100 - 119; +return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t; +} +case 60: { +T t = 2*y100 - 121; +return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t; +} +case 61: { +T t = 2*y100 - 123; +return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t; +} +case 62: { +T t = 2*y100 - 125; +return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t; +} +case 63: { +T t = 2*y100 - 127; +return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t; +} +case 64: { +T t = 2*y100 - 129; +return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t; +} +case 65: { +T t = 2*y100 - 131; +return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t; +} +case 66: { +T t = 2*y100 - 133; +return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t; +} +case 67: { +T t = 2*y100 - 135; +return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t; +} +case 68: { +T t = 2*y100 - 137; +return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t; +} +case 69: { +T t = 2*y100 - 139; +return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t; +} +case 70: { +T t = 2*y100 - 141; +return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t; +} +case 71: { +T t = 2*y100 - 143; +return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t; +} +case 72: { +T t = 2*y100 - 145; +return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t; +} +case 73: { +T t = 2*y100 - 147; +return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t; +} +case 74: { +T t = 2*y100 - 149; +return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t; +} +case 75: { +T t = 2*y100 - 151; +return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t; +} +case 76: { +T t = 2*y100 - 153; +return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t; +} +case 77: { +T t = 2*y100 - 155; +return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t; +} +case 78: { +T t = 2*y100 - 157; +return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t; +} +case 79: { +T t = 2*y100 - 159; +return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t; +} +case 80: { +T t = 2*y100 - 161; +return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t; +} +case 81: { +T t = 2*y100 - 163; +return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t; +} +case 82: { +T t = 2*y100 - 165; +return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t; +} +case 83: { +T t = 2*y100 - 167; +return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t; +} +case 84: { +T t = 2*y100 - 169; +return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t; +} +case 85: { +T t = 2*y100 - 171; +return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t; +} +case 86: { +T t = 2*y100 - 173; +return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t; +} +case 87: { +T t = 2*y100 - 175; +return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t; +} +case 88: { +T t = 2*y100 - 177; +return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t; +} +case 89: { +T t = 2*y100 - 179; +return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t; +} +case 90: { +T t = 2*y100 - 181; +return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t; +} +case 91: { +T t = 2*y100 - 183; +return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t; +} +case 92: { +T t = 2*y100 - 185; +return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t; +} +case 93: { +T t = 2*y100 - 187; +return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t; +} +case 94: { +T t = 2*y100 - 189; +return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t; +} +case 95: { +T t = 2*y100 - 191; +return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t; +} +case 96: { +T t = 2*y100 - 193; +return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t; +} +case 97: { +T t = 2*y100 - 195; +return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t; +} +case 98: { +T t = 2*y100 - 197; +return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t; +} +case 99: { +T t = 2*y100 - 199; +return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t; +} + } + // we only get here if y = 1, i.e. |x| < 4*eps, in which case + // erfcx is within 1e-15 of 1.. + return 1.0; +} + +template +C10_HOST_DEVICE inline typename std::enable_if_t, T> +calc_erfcx(T x) +{ + if (at::_isnan(x)) { + return x; + } + + if (x >= 0) { + if (x > 50) { // continued-fraction expansion is faster + const T ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi) + if (x > 5e7) { // 1-term expansion, important to avoid overflow + return ispi / x; + } + /* 5-term expansion (rely on compiler for CSE), simplified from: + ispi / (x+0.5/(x+1/(x+1.5/(x+2/x)))) */ + return ispi*((x*x) * (x*x+4.5) + 2) / (x * ((x*x) * (x*x+5) + 3.75)); + } + return erfcx_y100(400/(4+x)); + } + else { + if (x < -26.7) { + return std::numeric_limits::infinity(); + } + else if (x < -6.1) { + return 2*exp(x*x); + } + else { + return 2*exp(x*x) - erfcx_y100(400/(4-x)); + } + } +} + +/* + * Logarithm of Gaussian cumulative distribution function. + + * This implementation of log_ndtr and its helper functions + * follow SciPy's implementation + * See NOTICE for the licenses. + */ +template +inline C10_HOST_DEVICE T calc_log_ndtr(T x) { + T t = x * c10::frac_sqrt_2; + if (x < T{-1.0}) { + return std::log(calc_erfcx(-t) / 2) - t * t; + } else { + return std::log1p(-std::erfc(t) / 2); + } +} + +template +inline C10_HOST_DEVICE T airy_ai_forward(T x) { + static const T AN[] = { + +3.46538101525629032477e-01, + +1.20075952739645805542e+01, + +7.62796053615234516538e+01, + +1.68089224934630576269e+02, + +1.59756391350164413639e+02, + +7.05360906840444183113e+01, + +1.40264691163389668864e+01, + +9.99999999999999995305e-01, + }; + + static const T AD[] = { + +5.67594532638770212846e-01, + +1.47562562584847203173e+01, + +8.45138970141474626562e+01, + +1.77318088145400459522e+02, + +1.64234692871529701831e+02, + +7.14778400825575695274e+01, + +1.40959135607834029598e+01, + +1.00000000000000000470e+00, + }; + + static const T AFN[] = { + -1.31696323418331795333e-01, + -6.26456544431912369773e-01, + -6.93158036036933542233e-01, + -2.79779981545119124951e-01, + -4.91900132609500318020e-02, + -4.06265923594885404393e-03, + -1.59276496239262096340e-04, + -2.77649108155232920844e-06, + -1.67787698489114633780e-08, + }; + + static const T AFD[] = { + +1.33560420706553243746e+01, + +3.26825032795224613948e+01, + +2.67367040941499554804e+01, + +9.18707402907259625840e+00, + +1.47529146771666414581e+00, + +1.15687173795188044134e-01, + +4.40291641615211203805e-03, + +7.54720348287414296618e-05, + +4.51850092970580378464e-07, + }; + + static const T AGN[] = { + +1.97339932091685679179e-02, + +3.91103029615688277255e-01, + +1.06579897599595591108e+00, + +9.39169229816650230044e-01, + +3.51465656105547619242e-01, + +6.33888919628925490927e-02, + +5.85804113048388458567e-03, + +2.82851600836737019778e-04, + +6.98793669997260967291e-06, + +8.11789239554389293311e-08, + +3.41551784765923618484e-10, + }; + + static const T AGD[] = { + +9.30892908077441974853e+00, + +1.98352928718312140417e+01, + +1.55646628932864612953e+01, + +5.47686069422975497931e+00, + +9.54293611618961883998e-01, + +8.64580826352392193095e-02, + +4.12656523824222607191e-03, + +1.01259085116509135510e-04, + +1.17166733214413521882e-06, + +4.91834570062930015649e-09, + }; + + int domain_flag = 0; + + T ai; + + if (std::isinf(x)) { + return std::numeric_limits::quiet_NaN(); + } + + if (x > T(103.892)) { + return T(0.0); + } + + T f; + T g; + T k; + + if (x < T(-2.09)) { + T z = T(1.0) / (T(-2.0) * x * std::sqrt(-x) / T(3.0)); + + T afn = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afn = afn * (z * z) + AFN[index]; + } + + T afd = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afd = afd * (z * z) + AFD[index]; + } + + T agn = 0.0; + + for (uint8_t index = 0; index <= 10 + 0; index++) { + agn = agn * (z * z) + AGN[index]; + } + + T agd = 0.0; + + for (uint8_t index = 0; index <= 10 - 1; index++) { + agd = agd * (z * z) + AGD[index]; + } + + T t = T(-2.0) * x * std::sqrt(-x) / T(3.0) + T(0.25) * c10::pi; + + return T(5.64189583547756286948e-01) / std::sqrt(std::sqrt(-x)) * (std::sin(t) * (T(1.0) + z * z * afn / afd) - std::cos(t) * (z * agn / agd)); + } + + if (x >= T(2.09)) { + domain_flag = 5; + + T zeta = T(2.0) * x * std::sqrt(x) / T(3.0); + + T an = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + an = an * (T(1.0) / zeta) + AN[index]; + } + + T ad = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + ad = ad * (T(1.0) / zeta) + AD[index]; + } + + ai = T(5.64189583547756286948e-01) * (an / ad) / (T(2.0) * std::sqrt(std::sqrt(x)) * std::exp(zeta)); + + if (x > T(8.3203353)) { + return ai; + } + } + + f = 1.0; + g = x; + k = 1.0; + + T m = 1.0; + T n = x; + T t = 1.0; + T z = x * x * x; + + while (t > std::numeric_limits::epsilon()) { + m *= z; + k += T(1.0); + m /= k; + n *= z; + k += T(1.0); + n /= k; + m /= k; + f += m; + k += T(1.0); + n /= k; + g += n; + + t = std::abs(m / f); + } + + if ((domain_flag & 1) == 0) { + return T(0.355028053887817239260) * f - T(0.258819403792806798405) * g; + } + + return ai; +} // T airy_ai(T x) + +template +inline C10_HOST_DEVICE T bessel_j0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T RP[] = { + -4.79443220978201773821e+09, + +1.95617491946556577543e+12, + -2.49248344360967716204e+14, + +9.70862251047306323952e+15, + }; + + static const T RQ[] = { + +4.99563147152651017219e+02, + +1.73785401676374683123e+05, + +4.84409658339962045305e+07, + +1.11855537045356834862e+10, + +2.11277520115489217587e+12, + +3.10518229857422583814e+14, + +3.18121955943204943306e+16, + +1.71086294081043136091e+18, + }; + + if (x < T(0)) { + x = -x; + } + + if (x <= T(5.0)) { + if (x < T(0.00001)) { + return T(1.0) - x * x / T(4.0); + } + + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return (x * x - T(5.78318596294678452118e+00)) * (x * x - T(3.04712623436620863991e+01)) * rp / rq; + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * std::cos(x - T(0.785398163397448309615660845819875721)) - T(5.0) / x * (qp / qq) * std::sin(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / std::sqrt(x); +} // bessel_j0_forward(T x) + +template +inline C10_HOST_DEVICE T bessel_j1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T RP[] = { + -8.99971225705559398224e+08, + +4.52228297998194034323e+11, + -7.27494245221818276015e+13, + +3.68295732863852883286e+15, + }; + + static const T RQ[] = { + +6.20836478118054335476e+02, + +2.56987256757748830383e+05, + +8.35146791431949253037e+07, + +2.21511595479792499675e+10, + +4.74914122079991414898e+12, + +7.84369607876235854894e+14, + +8.95222336184627338078e+16, + +5.32278620332680085395e+18, + }; + + if (x < T(0.0)) { + return -bessel_j1_forward(-x); + } + + if (x <= T(5.0)) { + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return rp / rq * x * (x * x - T(1.46819706421238932572e+01)) * (x * x - T(4.92184563216946036703e+01)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * std::cos(x - T(2.356194490192344928846982537459627163)) - T(5.0) / x * (qp / qq) * std::sin(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / std::sqrt(x); +} // bessel_j1_forward(T x) + +template +inline C10_HOST_DEVICE T bessel_y0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T YP[] = { + +1.55924367855235737965e+04, + -1.46639295903971606143e+07, + +5.43526477051876500413e+09, + -9.82136065717911466409e+11, + +8.75906394395366999549e+13, + -3.46628303384729719441e+15, + +4.42733268572569800351e+16, + -1.84950800436986690637e+16, + }; + + static const T YQ[] = { + +1.04128353664259848412e+03, + +6.26107330137134956842e+05, + +2.68919633393814121987e+08, + +8.64002487103935000337e+10, + +2.02979612750105546709e+13, + +3.17157752842975028269e+15, + +2.50596256172653059228e+17, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return -std::numeric_limits::infinity(); + } + + if (x < T(0.0)) { + return std::numeric_limits::quiet_NaN(); + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return yp / yq + (T(0.636619772367581343075535053490057448) * std::log(x) * bessel_j0_forward(x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * std::sin(x - T(0.785398163397448309615660845819875721)) + T(5.0) / x * (qp / qq) * std::cos(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / std::sqrt(x); +} // bessel_y0_forward(T x) + +template +inline C10_HOST_DEVICE T bessel_y1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T YP[] = { + +1.26320474790178026440e+09, + -6.47355876379160291031e+11, + +1.14509511541823727583e+14, + -8.12770255501325109621e+15, + +2.02439475713594898196e+17, + -7.78877196265950026825e+17, + }; + + static const T YQ[] = { + +5.94301592346128195359e+02, + +2.35564092943068577943e+05, + +7.34811944459721705660e+07, + +1.87601316108706159478e+10, + +3.88231277496238566008e+12, + +6.20557727146953693363e+14, + +6.87141087355300489866e+16, + +3.97270608116560655612e+18, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return -std::numeric_limits::infinity(); + } + + if (x <= T(0.0)) { + return std::numeric_limits::quiet_NaN(); + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 5; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return x * (yp / yq) + (T(0.636619772367581343075535053490057448) * (bessel_j1_forward(x) * std::log(x) - T(1.0) / x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * std::sin(x - T(2.356194490192344928846982537459627163)) + T(5.0) / x * (qp / qq) * std::cos(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / std::sqrt(x); +} // bessel_y1_forward(T x) + +template +inline C10_HOST_DEVICE T chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (std::abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (std::abs(x) < T(1.0))) { + return std::cos(n * std::acos(x)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; +} // chebyshev_polynomial_t_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T chebyshev_polynomial_t_forward(T x, T n) { + return chebyshev_polynomial_t_forward(x, static_cast(n)); +} // chebyshev_polynomial_t_forward(T x, T n) + +template +inline C10_HOST_DEVICE T chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (std::abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 8) && (std::abs(x) < T(1.0))) { + if (std::sin(std::acos(x)) != T(0.0)) { + return std::sin((n + 1) * std::acos(x)) / std::sin(std::acos(x)); + } + + return (n + 1) * std::cos((n + 1) * std::acos(x)) / x; + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + T p = T(1.0); + T q = x + x; + T r; + + for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; +} // chebyshev_polynomial_u_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T chebyshev_polynomial_u_forward(T x, T n) { + return chebyshev_polynomial_u_forward(x, static_cast(n)); +} // chebyshev_polynomial_u_forward(T x, T n) + +template +inline C10_HOST_DEVICE T chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (std::abs(x) == T(1.0)) { + if (x > T(0.0)) { + return T(1.0); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if ((n > 8) && (std::abs(x) < T(1.0))) { + if (std::sin(std::acos(x) / T(2.0)) != T(1.0)) { + return std::cos((n + T(0.5)) * std::acos(x)) / std::cos(std::acos(x) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; +} // chebyshev_polynomial_v_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T chebyshev_polynomial_v_forward(T x, T n) { + return chebyshev_polynomial_v_forward(x, static_cast(n)); +} // chebyshev_polynomial_v_forward(T x, T n) + +template +inline C10_HOST_DEVICE T chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (std::abs(x) == T(1.0)) { + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 8) && (std::abs(x) < T(1.0))) { + if (std::cos(std::acos(x) / T(2.0)) != T(1.0)) { + return std::sin((n + T(0.5)) * std::acos(x)) / std::sin(std::acos(x) / T(2.0)); + } + + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x + T(1.0); + } + + T p = T(1.0); + T q = x + x + T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; +} // chebyshev_polynomial_w_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T chebyshev_polynomial_w_forward(T x, T n) { + return chebyshev_polynomial_w_forward(x, static_cast(n)); +} // chebyshev_polynomial_w_forward(T x, T n) + +template +constexpr auto getHermitianLimit() { + if constexpr (std::is_same_v) { + return 128; + } else if constexpr (std::is_same_v) { + return 512; + } else { + return 1024; + } +} + +template +inline C10_HOST_DEVICE T hermite_polynomial_h_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + if (n > getHermitianLimit()) { + return std::numeric_limits::quiet_NaN(); + } + + T p = T(1.0); + T q = x + x; + T r = T(0.0); + + for (int64_t k = 2; k < n + n; k += 2) { + r = (x + x) * q - k * p; + p = q; + q = r; + } + + return r; +} // hermite_polynomial_h_forward(T x, int64_t n) + +template, int> = 0> +inline C10_HOST_DEVICE T hermite_polynomial_h_forward(T x, T n) { + return hermite_polynomial_h_forward(x, static_cast(n)); +} // hermite_polynomial_h_forward(T x, T n) + +template, int> = 0> +__ubsan_ignore_float_cast_overflow__ inline C10_HOST_DEVICE T hermite_polynomial_h_forward(T x, T n) { + return hermite_polynomial_h_forward(x, (!std::isinf(n) && !std::isnan(n)) ? static_cast(n) : static_cast(-1)); +} // hermite_polynomial_h_forward(T x, T n) + +template +inline C10_HOST_DEVICE T hermite_polynomial_he_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + if (n > getHermitianLimit()) { + return std::numeric_limits::quiet_NaN(); + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = x * q - k * p; + p = q; + q = r; + } + + return r; +} // hermite_polynomial_he_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T hermite_polynomial_he_forward(T x, T n) { + return hermite_polynomial_he_forward(x, static_cast(n)); +} // hermite_polynomial_he_forward(T x, T n) + +template +inline C10_HOST_DEVICE T laguerre_polynomial_l_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (std::abs(x) == T(0.0)) { + return T(1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return T(1.0) - x; + } + + T p = T(1.0); + T q = T(1.0) - x; + T r; + + for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { + r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; +} // laguerre_polynomial_l_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T laguerre_polynomial_l_forward(T x, T n) { + return laguerre_polynomial_l_forward(x, static_cast(n)); +} // laguerre_polynomial_l_forward(T x, T n) + +template +inline C10_HOST_DEVICE T legendre_polynomial_p_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (std::abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; (k < n) && !std::isnan(q); k++) { + r = ((k + k + 1) * x * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; +} // legendre_polynomial_p_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T legendre_polynomial_p_forward(T x, T n) { + return legendre_polynomial_p_forward(x, static_cast(n)); +} // legendre_polynomial_p_forward(T x, T n) + +template +inline C10_HOST_DEVICE T modified_bessel_i0_forward(T x) { + static const T A[] = { + -4.41534164647933937950e-18, + +3.33079451882223809783e-17, + -2.43127984654795469359e-16, + +1.71539128555513303061e-15, + -1.16853328779934516808e-14, + +7.67618549860493561688e-14, + -4.85644678311192946090e-13, + +2.95505266312963983461e-12, + -1.72682629144155570723e-11, + +9.67580903537323691224e-11, + -5.18979560163526290666e-10, + +2.65982372468238665035e-09, + -1.30002500998624804212e-08, + +6.04699502254191894932e-08, + -2.67079385394061173391e-07, + +1.11738753912010371815e-06, + -4.41673835845875056359e-06, + +1.64484480707288970893e-05, + -5.75419501008210370398e-05, + +1.88502885095841655729e-04, + -5.76375574538582365885e-04, + +1.63947561694133579842e-03, + -4.32430999505057594430e-03, + +1.05464603945949983183e-02, + -2.37374148058994688156e-02, + +4.93052842396707084878e-02, + -9.49010970480476444210e-02, + +1.71620901522208775349e-01, + -3.04682672343198398683e-01, + +6.76795274409476084995e-01, + }; + + static const T B[] = { + -7.23318048787475395456e-18, + -4.83050448594418207126e-18, + +4.46562142029675999901e-17, + +3.46122286769746109310e-17, + -2.82762398051658348494e-16, + -3.42548561967721913462e-16, + +1.77256013305652638360e-15, + +3.81168066935262242075e-15, + -9.55484669882830764870e-15, + -4.15056934728722208663e-14, + +1.54008621752140982691e-14, + +3.85277838274214270114e-13, + +7.18012445138366623367e-13, + -1.79417853150680611778e-12, + -1.32158118404477131188e-11, + -3.14991652796324136454e-11, + +1.18891471078464383424e-11, + +4.94060238822496958910e-10, + +3.39623202570838634515e-09, + +2.26666899049817806459e-08, + +2.04891858946906374183e-07, + +2.89137052083475648297e-06, + +6.88975834691682398426e-05, + +3.36911647825569408990e-03, + +8.04490411014108831608e-01, + }; + + T p; + T q = 0.0; + + if (std::abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 30; index++) { + p = q; + q = a; + a = ((std::abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + return std::exp(std::abs(x)) * (T(0.5) * (a - p)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / std::abs(x) - T(2.0)) * q - p + B[index]; + } + + return std::exp(std::abs(x)) * (T(0.5) * (b - p)) / std::sqrt(std::abs(x)); +} // modified_bessel_i0_forward(T x) + +template +inline C10_HOST_DEVICE T modified_bessel_i1_forward(T x) { + static const T A[] = { + +2.77791411276104639959e-18, + -2.11142121435816608115e-17, + +1.55363195773620046921e-16, + -1.10559694773538630805e-15, + +7.60068429473540693410e-15, + -5.04218550472791168711e-14, + +3.22379336594557470981e-13, + -1.98397439776494371520e-12, + +1.17361862988909016308e-11, + -6.66348972350202774223e-11, + +3.62559028155211703701e-10, + -1.88724975172282928790e-09, + +9.38153738649577178388e-09, + -4.44505912879632808065e-08, + +2.00329475355213526229e-07, + -8.56872026469545474066e-07, + +3.47025130813767847674e-06, + -1.32731636560394358279e-05, + +4.78156510755005422638e-05, + -1.61760815825896745588e-04, + +5.12285956168575772895e-04, + -1.51357245063125314899e-03, + +4.15642294431288815669e-03, + -1.05640848946261981558e-02, + +2.47264490306265168283e-02, + -5.29459812080949914269e-02, + +1.02643658689847095384e-01, + -1.76416518357834055153e-01, + +2.52587186443633654823e-01, + }; + + static const T B[] = { + +7.51729631084210481353e-18, + +4.41434832307170791151e-18, + -4.65030536848935832153e-17, + -3.20952592199342395980e-17, + +2.96262899764595013876e-16, + +3.30820231092092828324e-16, + -1.88035477551078244854e-15, + -3.81440307243700780478e-15, + +1.04202769841288027642e-14, + +4.27244001671195135429e-14, + -2.10154184277266431302e-14, + -4.08355111109219731823e-13, + -7.19855177624590851209e-13, + +2.03562854414708950722e-12, + +1.41258074366137813316e-11, + +3.25260358301548823856e-11, + -1.89749581235054123450e-11, + -5.58974346219658380687e-10, + -3.83538038596423702205e-09, + -2.63146884688951950684e-08, + -2.51223623787020892529e-07, + -3.88256480887769039346e-06, + -1.10588938762623716291e-04, + -9.76109749136146840777e-03, + +7.78576235018280120474e-01, + }; + + T p; + T q = 0.0; + + if (std::abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 29; index++) { + p = q; + q = a; + a = ((std::abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + if (x < T(0.0)) { + return -(T(0.5) * (a - p) * std::abs(x) * std::exp(std::abs(x))); + } + + return T(0.5) * (a - p) * std::abs(x) * std::exp(std::abs(x)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / std::abs(x) - T(2.0)) * q - p + B[index]; + } + + if (x < T(0.0)) { + return -(std::exp(std::abs(x)) * (T(0.5) * (b - p)) / std::sqrt(std::abs(x))); + } + + return std::exp(std::abs(x)) * (T(0.5) * (b - p)) / std::sqrt(std::abs(x)); +} // modified_bessel_i1_forward(T x) + +template +inline C10_HOST_DEVICE T modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return std::numeric_limits::infinity(); + } + + if (x < T(0.0)) { + return std::numeric_limits::quiet_NaN(); + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return T(0.5) * (a - p) - std::log(0.5 * x) * modified_bessel_i0_forward(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return std::exp(-x) * (T(0.5) * (b - p)) / std::sqrt(x); +} // modified_bessel_k0_forward(T x) + +template +inline C10_HOST_DEVICE T modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return std::numeric_limits::infinity(); + } + + if (x < T(0.0)) { + return std::numeric_limits::quiet_NaN(); + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return std::log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x; + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return std::exp(-x) * (T(0.5) * (b - p)) / std::sqrt(x); +} // modified_bessel_k1_forward(T x) + +template +inline C10_HOST_DEVICE T scaled_modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return std::numeric_limits::infinity(); + } + + if (x < T(0.0)) { + return std::numeric_limits::quiet_NaN(); + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint64_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (T(0.5) * (a - p) - std::log(T(0.5) * x) * modified_bessel_i0_forward(x)) * std::exp(x); + } + + T b = B[0]; + + for (uint64_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return T(0.5) * (b - p) / std::sqrt(x); +} // T scaled_modified_bessel_k0_forward(T x) + +template +inline C10_HOST_DEVICE T scaled_modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return std::numeric_limits::infinity(); + } + + if (x < T(0.0)) { + return std::numeric_limits::quiet_NaN(); + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint64_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (std::log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x) * std::exp(x); + } + + T b = B[0]; + + for (uint64_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return (T(0.5) * (b - p) / std::sqrt(x)); +} // T scaled_modified_bessel_k1_forward(T x) + +template +inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) { + return std::cos(n * std::acos(x + x - T(1.0))); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; +} // shifted_chebyshev_polynomial_t_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_t_forward(T x, T n) { + return shifted_chebyshev_polynomial_t_forward(x, static_cast(n)); +} // shifted_chebyshev_polynomial_t_forward(T x, T n) + +template +inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) { + if (std::sin(std::acos(x + x - T(1.0))) != T(0.0)) { + return std::sin((n + 1) * std::acos(x + x - T(1.0))) / std::sin(std::acos(x + x - T(1.0))); + } + + return (n + 1) * std::cos((n + 1) * std::acos(x + x - T(1.0))) / (x + x - T(1.0)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)); + T r; + + for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; +} // shifted_chebyshev_polynomial_u_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_u_forward(T x, T n) { + return shifted_chebyshev_polynomial_u_forward(x, static_cast(n)); +} // shifted_chebyshev_polynomial_u_forward(T x, T n) + +template +inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return (n + n + 1); + } + + return -(n + n + 1); + } + + if ((n > 6) && (std::abs(x + x - T(1.0)) < T(1.0))) { + if (std::sin(std::acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return std::cos((n + T(0.5)) * std::acos(x + x - T(1.0))) / std::cos(std::acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; +} // shifted_chebyshev_polynomial_v_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, T n) { + return shifted_chebyshev_polynomial_v_forward(x, static_cast(n)); +} // shifted_chebyshev_polynomial_v_forward(T x, T n) + +template +inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 4) && (std::abs(x + x - T(1.0)) < T(1.0))) { + if (std::cos(std::acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return std::sin((n + T(0.5)) * std::acos(x + x - T(1.0))) / std::sin(std::acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; +} // shifted_chebyshev_polynomial_w_forward(T x, int64_t n) + +template +inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_w_forward(T x, T n) { + return shifted_chebyshev_polynomial_w_forward(x, static_cast(n)); +} // shifted_chebyshev_polynomial_w_forward(T x, T n) + +template +inline C10_HOST_DEVICE T spherical_bessel_j0_forward(T x) { + if (std::isinf(x)) { + return T(0.0); + } + + if (std::abs(x) < T(0.5)) { + return T(1.0) + x * x * (T(-1.0) / T(6.0) + x * x * (T(1.0) / T(120.0) + x * x * (T(-1.0) / T(5040.0) + x * x * (T(1.0) / T(362880.0) + x * x * (T(-1.0) / T(39916800.0) + x * x * (T(1.0) / T(6227020800.0))))))); + } + + return std::sin(x) / x; +} // T spherical_bessel_j0_forward(T x) + +C10_CLANG_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h new file mode 100644 index 0000000000000000000000000000000000000000..3cdb4465af19f8c07cfbea06e2c17f69f64fe21a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h @@ -0,0 +1,76 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at { +// views and their in-place version ops +#define TORCH_VIEW_FNS(m) \ + m.impl("as_strided_", torch::CppFunction::makeFallthrough()); \ + m.impl("detach", torch::CppFunction::makeFallthrough()); \ + m.impl("detach_", torch::CppFunction::makeFallthrough()); \ + m.impl("diagonal", torch::CppFunction::makeFallthrough()); \ + m.impl("expand", torch::CppFunction::makeFallthrough()); \ + m.impl("expand_as", torch::CppFunction::makeFallthrough()); \ + m.impl("movedim.int", torch::CppFunction::makeFallthrough()); \ + m.impl("movedim.intlist", torch::CppFunction::makeFallthrough()); \ + m.impl("narrow", torch::CppFunction::makeFallthrough()); \ + m.impl("permute", torch::CppFunction::makeFallthrough()); \ + m.impl("select.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("select.int", torch::CppFunction::makeFallthrough()); \ + m.impl("squeeze", torch::CppFunction::makeFallthrough()); \ + m.impl("squeeze_", torch::CppFunction::makeFallthrough()); \ + m.impl("transpose.int", torch::CppFunction::makeFallthrough()); \ + m.impl("transpose.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("transpose_", torch::CppFunction::makeFallthrough()); \ + m.impl("t", torch::CppFunction::makeFallthrough()); \ + m.impl("t_", torch::CppFunction::makeFallthrough()); \ + m.impl("real", torch::CppFunction::makeFallthrough()); \ + m.impl("imag", torch::CppFunction::makeFallthrough()); \ + m.impl("view_as_real", torch::CppFunction::makeFallthrough()); \ + m.impl("unflatten.int", torch::CppFunction::makeFallthrough()); \ + m.impl("unflatten.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("unfold", torch::CppFunction::makeFallthrough()); \ + m.impl("unsqueeze", torch::CppFunction::makeFallthrough()); \ + m.impl("unsqueeze_", torch::CppFunction::makeFallthrough()); \ + m.impl("view_as", torch::CppFunction::makeFallthrough()); \ + m.impl("unbind.int", torch::CppFunction::makeFallthrough()); \ + m.impl("unbind.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("split.Tensor", torch::CppFunction::makeFallthrough()); \ + m.impl("split_with_sizes", torch::CppFunction::makeFallthrough()); \ + m.impl("swapaxes", torch::CppFunction::makeFallthrough()); \ + m.impl("swapdims", torch::CppFunction::makeFallthrough()); \ + m.impl("chunk", torch::CppFunction::makeFallthrough()); \ + m.impl("reshape", torch::CppFunction::makeFallthrough()); \ + m.impl("alias", torch::CppFunction::makeFallthrough()); \ + m.impl("hsplit.int", torch::CppFunction::makeFallthrough()); \ + m.impl("hsplit.array", torch::CppFunction::makeFallthrough()); \ + m.impl("dsplit.int", torch::CppFunction::makeFallthrough()); \ + m.impl("dsplit.array", torch::CppFunction::makeFallthrough()); \ + m.impl("vsplit.int", torch::CppFunction::makeFallthrough()); \ + m.impl("vsplit.array", torch::CppFunction::makeFallthrough()); \ + m.impl("conj", torch::CppFunction::makeFallthrough()); \ + m.impl("_conj", torch::CppFunction::makeFallthrough()); \ + m.impl("_unsafe_view", torch::CppFunction::makeFallthrough()); \ + m.impl("resize_", torch::CppFunction::makeFallthrough()); + +#define TENSOR_UTILITIES_AND_CONSTRUCTORS(m) \ + m.impl("empty_like", torch::CppFunction::makeFallthrough()); \ + m.impl("empty.memory_format", torch::CppFunction::makeFallthrough()); \ + m.impl("empty.out", torch::CppFunction::makeFallthrough()); \ + m.impl("empty_strided", torch::CppFunction::makeFallthrough()); \ + m.impl("full_like", torch::CppFunction::makeFallthrough()); \ + m.impl("stride.int", torch::CppFunction::makeFallthrough()); \ + m.impl("stride.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("size.int", torch::CppFunction::makeFallthrough()); \ + m.impl("size.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("is_complex", torch::CppFunction::makeFallthrough()); \ + m.impl("is_floating_point", torch::CppFunction::makeFallthrough()); \ + m.impl("requires_grad_", torch::CppFunction::makeFallthrough()); +} + +#define TORCH_VIEW_FNS_NATIVE_FN_REGISTRATION(m) \ + m.impl("as_strided", torch::CppFunction::makeFallthrough()); \ + m.impl("view", torch::CppFunction::makeFallthrough()); + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/MathBitsFallback.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/MathBitsFallback.h new file mode 100644 index 0000000000000000000000000000000000000000..f022190da40fe9898d3701a37b8fdbc9c7f09613 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/MathBitsFallback.h @@ -0,0 +1,162 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include + +#include +#endif + +namespace at::native { +// This fallback should only be used for operations that are self inverse and have a corresponding tensor +// bit (internally implemented using DispatchKey) to maintain the state on tensor using tensor bit. +// Currently there are two tensor bits that trigger this fallback: conjugate bit and negative bit. +// Conjugate bit is set on a tensor when `.conj()` is called and neg bit is set on a tensor when `.conj().imag` is called. + +// NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit. +struct MathOpFallback { + MathOpFallback(DispatchKey key_, std::string op_name_) : key(key_), op_name(std::move(op_name_)) {} + virtual bool is_bit_set(const Tensor&) = 0; + void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { + /* + Situations to handle: + 1. Out-of-place operation. Easy: materialize all inputs and + call it a day. + 2. Inplace operation. Desugar x.add_(2) into x.conj_().add_(2).conj_(). + Materialize other inputs as in (1). + 3. out= operation. Desugar add(x, 2, out=y) into y.copy_(add(x, 2)) + Materialize other inputs as in (1). + + It is important to be able to tell if we READ from an argument and if we + WRITE to an argument. Conservative approach is to assume that we always + READ from an argument, but in out= operations you can skip + conjugating inputs on entry that never get used. In the current schema we + can't easily tell if the operation is in in-place or out= operation. + + Note: + 1. Mutable tensorlists containing tensors whose math bit set to true are disallowed. + 2. Mutable tensors with math bit set to true are unconditionally cloned to ensure + correct behavior in the case when the mutable tensor shares memory with non mutable arguments. + + If we were to in-place resolve the math bit for mutable inputs, then the non-mutable inputs sharing partial or full memory + with these mutable inputs would read into wrong values in the following cases: + 1. Non mutable inputs have their math bit set to false. + 2. Math bit for mutable input(s) is resolved before the non mutable inputs (with bit set to true and sharing memory + with one or more mutable arg(s)) are cloned. + At the end, the final value of the mutable arguments from the stack are copied into the original input mutable tensor inputs. + */ + const auto& arguments = op.schema().arguments(); + const auto num_arguments = arguments.size(); + const auto stack_start = stack->size() - num_arguments; + + std::optional is_write; + for (const auto i : c10::irange(num_arguments)) { + // Three possible states: + // 1. alias_info has no value --> out-of-place operation + // 2. alias_info does have a value, alias_info->is_write=True --> in-place or out= operation + // 3. alias_info does have a value, alias_info->is_write=False --> view operation + const AliasInfo* alias_info = arguments[i].alias_info(); + if (alias_info != nullptr) { + if (is_write.has_value()) { + TORCH_CHECK(*is_write == alias_info->isWrite(), + "Unsupported operator for ", op_name, " fallback: ", op.schema().name(), + op_name, " fallback doesn't work for operators with a mix " + "mutable and non-mutable inputs that alias with outputs, " + "this must be implemented manually. " + "If you got this error on a core op, please report a bug to PyTorch."); + } else { + is_write = alias_info->isWrite(); + } + } + } + + if (is_write.has_value() && !*is_write) { + // We assume that view operators automatically handle the math bit + // correctly by propagating the dispatch key in key_set. + // This is not necessarily always right, so you should test these cases. + op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack); + return; + } + + // Mutable inputs with math bit set to True and their clones + std::vector> mutable_inputs_with_their_clones; + for (const auto i : c10::irange(num_arguments)) { + auto& ivalue = (*stack)[stack_start + i]; + if (!(ivalue.isTensor() || ivalue.isTensorList())) { + continue; + } + const auto& argument = arguments[i]; + bool mut_arg = false; + if (argument.alias_info()) { + // Was already tested by is_write loop above + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(argument.alias_info()->isWrite()); + mut_arg = true; + } + if (ivalue.isTensor()) { + if (!is_bit_set(ivalue.toTensor())) { + continue; + } + auto tensor = std::move(ivalue).toTensor(); + auto resolved_tensor = at::clone(tensor); + if (mut_arg) { + TORCH_CHECK(mutable_inputs_with_their_clones.empty(), op_name, " fallback does not support operators with more than one mutable tensors with ", + op_name, "bit set to true."); + mutable_inputs_with_their_clones.emplace_back(std::move(tensor), resolved_tensor); + } + (*stack)[stack_start + i] = std::move(resolved_tensor); + } else if (ivalue.isTensorList()) { + auto tensors = std::move(ivalue).toTensorList(); + for(const auto j : c10::irange(tensors.size())) { + const auto& tensor = tensors[j]; + if (!is_bit_set(tensor)) { + continue; + } + TORCH_CHECK(!mut_arg, " fallback doesn't currently support mutable TensorLists with ", + op_name, " inputs. Please materialize all the ", op_name, " input tensor(s) in the mutable TensorList inputs before calling ", + op.schema().name()); + tensors[j] = at::clone(tensor); + } + (*stack)[stack_start + i] = std::move(tensors); + } + } + + op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack); + + TORCH_INTERNAL_ASSERT(mutable_inputs_with_their_clones.size() <= 1); + + for (std::pair mut_tensors: mutable_inputs_with_their_clones) { + auto& mutable_input = mut_tensors.first; + auto& cloned_mutable_input = mut_tensors.second; + auto& ivalue = (*stack)[stack_start]; + auto returned_output = std::move(ivalue).toTensor(); + + // sanity check to ensure that the tensor in stack aliases the cloned_mutable_input + TORCH_INTERNAL_ASSERT(cloned_mutable_input.is_same(returned_output)); + + // necessary for out= arg + at::native::resize_output(mutable_input, returned_output.sizes()); + + mutable_input.copy_(returned_output); + (*stack)[stack_start] = std::move(mutable_input); + } + } + + virtual ~MathOpFallback() = default; + + DispatchKey key; + std::string op_name; +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/MaxPooling.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/MaxPooling.h new file mode 100644 index 0000000000000000000000000000000000000000..dab605a789f9da5892c999d3534fcdca88e043bb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/MaxPooling.h @@ -0,0 +1,102 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +inline void check_max_pool1d( + const Tensor& self, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool ceil_mode) { + + TORCH_CHECK( + self.dim() == 2 || self.dim() == 3, + "max_pool1d() Expected 2D or 3D input tensor, but got ", self.sym_sizes()); + TORCH_CHECK( + kernel_size.size() == 1, + "max_pool1d() kernel_size must be an int, list of ints or tuple of ints of size 1 but got size ", + kernel_size.size()); + TORCH_CHECK( + stride.empty() || stride.size() == 1, + "max_pool1d() stride must be None, an int, list of ints, or tuple of ints of size 1 but got size ", + stride.size()); + TORCH_CHECK( + padding.size() == 1, + "max_pool1d() padding must be an int, list of ints, or tuple of ints of size 1 but got size ", + padding.size()); + TORCH_CHECK( + dilation.size() == 1, + "max_pool1d() dilation must be an int, list of ints or tuple of ints of size 1 but got size ", + dilation.size()); + + // If stride=None then set it to kernel_size + if (stride.empty()) { + stride = kernel_size; + } + + TORCH_CHECK( + kernel_size[0] > 0, + "max_pool1d() kernel_size must be greater than zero, but got ", + kernel_size[0]); + TORCH_CHECK( + stride[0] > 0, "max_pool1d() stride must be greater than zero, but got ", stride[0]); + TORCH_CHECK( + padding[0] >= 0, "max_pool1d() padding must be non-negative, but got ", padding[0]); + TORCH_CHECK( + padding[0] <= kernel_size[0] / 2, + "max_pool1d() padding should be at most half of kernel size, but got padding=", + padding[0], + " and kernel_size=", + kernel_size[0]); + TORCH_CHECK( + dilation[0] > 0, "max_pool1d() dilation must be greater than zero, but got ", dilation[0]); + + const int64_t OW = pooling_output_shape(self.sym_size(-1).guard_int(__FILE__, __LINE__), kernel_size[0], padding[0], stride[0], dilation[0], ceil_mode); + TORCH_CHECK(OW > 0, "max_pool1d() Invalid computed output size: ", OW); +} + +// TODO(Heitor) Template by dimension +struct PoolingParams1D { + int64_t NB; // Number of batches + int64_t NC; // Number of channels + int64_t IW; // Input width + int64_t OW; // Output width + int64_t KW; // Kernel width + int64_t SJ; // Column stride + int64_t PJ; // Column padding + int64_t DJ; // Column dilation + + // Return index of input element for the given kernel and output index + inline int64_t index(int64_t kj, int64_t oj) const { + return oj * SJ + kj * DJ - PJ; + } + + // Return index of first output within bounds for this kernel index + inline int64_t valid_output_start(int64_t kj) const { + int64_t ij = index(kj, 0);; + return ij < 0 ? at::divup(-ij, SJ) : 0; + } + + // Return index one past last output within bounds for this kernel index + inline int64_t valid_output_end(int64_t kj) const { + int64_t ij = index(kj, OW - 1); + return ij >= IW ? OW - at::divup(ij - (IW - 1), SJ) : OW; + } +}; + +using pooling_fn = void (*)(Tensor&, const Tensor&, const PoolingParams1D&); + +DECLARE_DISPATCH(pooling_fn, max_pool1d_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/NonEmptyUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/NonEmptyUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..8eff5eb7714571a6c66513b78e35d5dfd02aca90 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/NonEmptyUtils.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include + +namespace at::native { + +inline int64_t ensure_nonempty_dim(int64_t dim) { + return std::max(dim, 1); +} + +inline int64_t ensure_nonempty_size(const TensorBase &t, int64_t dim) { + return t.dim() == 0 ? 1 : t.size(dim); +} + +inline int64_t ensure_nonempty_stride(const TensorBase &t, int64_t dim) { + return t.dim() == 0 ? 1 : t.stride(dim); +} + +using IdxVec = std::vector; +inline IdxVec ensure_nonempty_vec(IdxVec vec) { + if (vec.empty()) { + vec.push_back(1); + } + return vec; +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/NonSymbolicBC.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/NonSymbolicBC.h new file mode 100644 index 0000000000000000000000000000000000000000..50a20a19a801b7b5b3d5b5f53a87a84a623f74f9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/NonSymbolicBC.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + +namespace at::native { +// This file contains non-symbolic signatures for ops that we have sym-intified the signature of. +// However, in certain cases (such as static runtime), we call the native versions of the ops directly. +// In those cases, we will duplicate the signature here with non-symbolic ints, and also duplicate the C++ implementation. +TORCH_API at::Tensor reshape(const at::Tensor& self, at::IntArrayRef proposed_shape); +TORCH_API at::Tensor narrow(const at::Tensor& self, int64_t dim, int64_t start, int64_t length); +TORCH_API at::Tensor _sparse_coo_tensor_unsafe(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, std::optional dtype=std::nullopt, std::optional layout=std::nullopt, std::optional device=std::nullopt, std::optional pin_memory=std::nullopt, std::optional is_coalesced=std::nullopt); +TORCH_API at::Tensor nll_loss(const at::Tensor & self, const at::Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index); +TORCH_API at::Tensor nll_loss2d(const at::Tensor & self, const at::Tensor & target, const std::optional& weight_opt, int64_t reduction, int64_t ignore_index); +// The below ops don't get a duplicated C++ implementation. +// They are backward ops, which make them very unlikely to be called directly +// by external code (at::native::trace_backward). +// They get their own declaration for BC purposes however. +TORCH_API at::Tensor _embedding_bag_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const std::optional & per_sample_weights, int64_t padding_idx=-1); +TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const std::optional & per_sample_weights, int64_t padding_idx=-1); +TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim); +TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes); +TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index); +TORCH_API at::Tensor select(const at::Tensor& self, int64_t dim, int64_t index); +TORCH_API std::vector tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim); +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Normalization.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Normalization.h new file mode 100644 index 0000000000000000000000000000000000000000..34fcd64e33aea5d838a8d640fed0d54c0b36f6fa --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Normalization.h @@ -0,0 +1,24 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +using renorm_scale_factor_fn = void (*) (TensorIteratorBase& iter, double maxnorm); +DECLARE_DISPATCH(renorm_scale_factor_fn, renorm_scale_factor_stub) + +enum class BatchNormBackend { + Native, + Cudnn, + Miopen, +}; + +TORCH_API BatchNormBackend _select_batch_norm_backend(const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double eps); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Padding.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Padding.h new file mode 100644 index 0000000000000000000000000000000000000000..ba9f7825c55ffbda03ac375531459b135240f70b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Padding.h @@ -0,0 +1,68 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +using padding_fn = void (*)(const Tensor&, const Tensor&, IntArrayRef); + +// reflection padding +DECLARE_DISPATCH(padding_fn, reflection_pad1d_kernel) +DECLARE_DISPATCH(padding_fn, reflection_pad1d_backward_kernel) +DECLARE_DISPATCH(padding_fn, reflection_pad2d_kernel) +DECLARE_DISPATCH(padding_fn, reflection_pad2d_backward_kernel) +DECLARE_DISPATCH(padding_fn, reflection_pad3d_kernel) +DECLARE_DISPATCH(padding_fn, reflection_pad3d_backward_kernel) + +// replication padding +DECLARE_DISPATCH(padding_fn, replication_pad1d_kernel) +DECLARE_DISPATCH(padding_fn, replication_pad1d_backward_kernel) +DECLARE_DISPATCH(padding_fn, replication_pad2d_kernel) +DECLARE_DISPATCH(padding_fn, replication_pad2d_backward_kernel) +DECLARE_DISPATCH(padding_fn, replication_pad3d_kernel) +DECLARE_DISPATCH(padding_fn, replication_pad3d_backward_kernel) + +namespace padding { + +template +inline void check_valid_input(const Tensor& input, IntArrayRef padding) { + + TORCH_CHECK(padding.size() == 2 * dim, + "padding size is expected to be ", 2 * dim, + ", but got: ", padding.size()); + + int input_dim = input.dim(); + + bool is_batch_mode = input_dim == (dim + 2); + bool is_non_batch_mode = input_dim == (dim + 1); + + bool valid_batch_mode = is_batch_mode; + bool valid_non_batch_mode = is_non_batch_mode; + + if (is_batch_mode) { + // allow batch size of 0-dim. + for (const auto d : c10::irange(1, input_dim)) { + valid_batch_mode = valid_batch_mode && input.size(d) != 0; + } + } else { + for (const auto d : c10::irange(0, input_dim)) { + valid_non_batch_mode = valid_non_batch_mode && input.size(d) != 0; + } + } + + // allow empty batch size but not other dimensions. + TORCH_CHECK(valid_batch_mode || valid_non_batch_mode, + "Expected ", dim + 1, "D or ", dim + 2, + "D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); +} + +} // namespace padding + +} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/PixelShuffle.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/PixelShuffle.h new file mode 100644 index 0000000000000000000000000000000000000000..856b015a35153bc45fb5320642d1c79fcf564f64 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/PixelShuffle.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include + +namespace at::native { + +inline void check_pixel_shuffle_shapes(const Tensor& self, int64_t upscale_factor) { + TORCH_CHECK(self.dim() >= 3, + "pixel_shuffle expects input to have at least 3 dimensions, but got input with ", + self.dim(), " dimension(s)"); + TORCH_CHECK(upscale_factor > 0, + "pixel_shuffle expects a positive upscale_factor, but got ", + upscale_factor); + int64_t c = self.size(-3); + TORCH_CHECK_VALUE(upscale_factor <= std::numeric_limits::max() / upscale_factor, + "upscale factor is too large, (upscale_factor)^2 overflowed: upscale_factor=", upscale_factor); + int64_t upscale_factor_squared = upscale_factor * upscale_factor; + TORCH_CHECK(c % upscale_factor_squared == 0, + "pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of " + "upscale_factor, but input.size(-3)=", c, " is not divisible by ", upscale_factor_squared); +} + +inline void check_pixel_unshuffle_shapes(const Tensor& self, int64_t downscale_factor) { + TORCH_CHECK( + self.dim() >= 3, + "pixel_unshuffle expects input to have at least 3 dimensions, but got input with ", + self.dim(), + " dimension(s)"); + TORCH_CHECK( + downscale_factor > 0, + "pixel_unshuffle expects a positive downscale_factor, but got ", + downscale_factor); + int64_t h = self.size(-2); + int64_t w = self.size(-1); + TORCH_CHECK( + h % downscale_factor == 0, + "pixel_unshuffle expects height to be divisible by downscale_factor, but input.size(-2)=", + h, + " is not divisible by ", + downscale_factor); + TORCH_CHECK( + w % downscale_factor == 0, + "pixel_unshuffle expects width to be divisible by downscale_factor, but input.size(-1)=", + w, + " is not divisible by ", + downscale_factor); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/PointwiseOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/PointwiseOps.h new file mode 100644 index 0000000000000000000000000000000000000000..6a1dbfd7365ba7d264faacc07dde79920e8dec98 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/PointwiseOps.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Ternary and higher-order pointwise operations +#pragma once + +#include + +namespace c10 { +class Scalar; +} + +namespace at { + +struct TensorIterator; +struct TensorIteratorBase; + +namespace native { + +using pointwise_fn = void (*)(TensorIterator&, const Scalar& scalar); +using structured_pointwise_fn = void (*)(TensorIteratorBase&, const Scalar& scalar); +using pointwise_fn_double = void (*)(TensorIterator&, const Scalar&, double); + +DECLARE_DISPATCH(structured_pointwise_fn, addcmul_stub) +DECLARE_DISPATCH(structured_pointwise_fn, addcdiv_stub) +DECLARE_DISPATCH(pointwise_fn_double, smooth_l1_backward_stub) +DECLARE_DISPATCH(pointwise_fn_double, huber_backward_stub) +DECLARE_DISPATCH(pointwise_fn, mse_backward_stub) + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Pool.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Pool.h new file mode 100644 index 0000000000000000000000000000000000000000..b2d2a054df6c6b66146ef004596bf7749869365e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Pool.h @@ -0,0 +1,366 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include +#include + +#include + +#pragma once + +namespace at::native { + +using max_pool2d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, + int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH); +using max_pool2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); + +DECLARE_DISPATCH(max_pool2d_fn, max_pool2d_kernel) +DECLARE_DISPATCH(max_pool2d_backward_fn, max_pool2d_backward_kernel) + +// average pooling has same signature for forward and backward +using avg_pool2d_fn = void(*)(const Tensor& output, const Tensor& input, int64_t kW, int64_t kH, + int64_t dW, int64_t dH, int64_t padW, int64_t padH, bool count_include_pad, std::optional divisor_override); +using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input, int kW, int kH, + int dW, int dH, int padW, int padH, bool count_include_pad, std::optional divisor_override); + +DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel) +DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel) + +// average pooling has same signature for forward and backward +using avg_pool3d_fn = void(*)(const Tensor& output, const Tensor& input, + int64_t kW, int64_t kH, int64_t kD, int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, + std::optional divisor_override); +using avg_pool3d_backward_fn = void(*)(const Tensor& output, const Tensor& input, + int kW, int kH, int kD, int dW, int dH, int dD, + int padW, int padH, int padD, bool count_include_pad, + std::optional divisor_override); + +DECLARE_DISPATCH(avg_pool3d_fn, avg_pool3d_kernel) +DECLARE_DISPATCH(avg_pool3d_backward_fn, avg_pool3d_backward_kernel) + +using max_pool3d_fn = void(*)(Tensor& output, Tensor& indices, const Tensor& input, + int kW, int kH, int kD, int dW, int dH, int dD, int pW, int pH, int pD, int dilationW, int dilationH, int dilationD); +using max_pool3d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); + +DECLARE_DISPATCH(max_pool3d_fn, max_pool3d_kernel) +DECLARE_DISPATCH(max_pool3d_backward_fn, max_pool3d_backward_kernel) +namespace { + +template +inline dest_t +safe_downcast(src_t v) +{ + TORCH_CHECK(std::numeric_limits::min() <= v && v <= std::numeric_limits::max(), + "integer out of range"); + + return static_cast(v); +} + +template +inline T pooling_output_shape_pad_lr( + T inputSize, T kernelSize, T pad_l, T pad_r, T stride, T dilation, + bool ceil_mode) { + T outputSize = div_rtn( + inputSize + pad_l + pad_r - dilation * (kernelSize - 1) - 1 + + (ceil_mode ? stride - 1 : 0), stride) + 1; + if (ceil_mode) { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputSize - 1) * stride >= inputSize + pad_l) { + --outputSize; + } + } + return outputSize; +} + +template +inline T pooling_output_shape( + T inputSize, T kernelSize, T pad, T stride, T dilation, bool ceil_mode) { + TORCH_CHECK(stride != 0, "stride should not be zero"); + TORCH_CHECK(pad >= 0, + "pad must be non-negative, but got pad: ", pad); + TORCH_CHECK(pad <= ((kernelSize - 1) * dilation + 1) / 2, + "pad should be at most half of effective kernel size, but got pad=", + pad, ", kernel_size=", kernelSize, " and dilation=", dilation) + return pooling_output_shape_pad_lr( + inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode); +} + +template +std::pair _pooling_same_mode_padding_lr( + T inputSize, T kernelSize, T stride, T dilation) { + // NOTE: with strides, the output shape is ceil(inputSize/stride) + auto total_padding = T(dilation) * (kernelSize - 1); + + // Prefer symmetric padding if possible + if (stride > 2 && (total_padding % 2 == 1)) { + // The floor in the output size calculation gives us a little wiggle room + auto wiggle_room = inputSize % stride - 1; + if (wiggle_room > 0) { + total_padding = total_padding - 1; + } + } + + auto left = total_padding / 2; + return {left, total_padding - left}; +} + +inline std::pair pooling_same_mode_padding_lr( + int64_t inputSize, int64_t kernelSize, int64_t stride, int64_t dilation) { + return _pooling_same_mode_padding_lr(inputSize, kernelSize, stride, dilation); +} + +inline std::pair pooling_same_mode_padding_lr( + c10::SymInt inputSize, c10::SymInt kernelSize, c10::SymInt stride, c10::SymInt dilation) { + return _pooling_same_mode_padding_lr(std::move(inputSize), std::move(kernelSize), std::move(stride), std::move(dilation)); +} + +// AveragePool2d/DilatedMaxPool2d (forward) +inline void +pool2d_shape_check( + const Tensor& input, + int64_t kH, int64_t kW, int64_t dH, int64_t dW, int64_t padH, int64_t padW, int64_t dilationH, int64_t dilationW, + int64_t nInputPlane, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth, MemoryFormat memory_format) +{ + const int64_t ndim = input.ndimension(); +#ifndef STRIP_ERROR_MESSAGES + const int64_t nOutputPlane = nInputPlane; +#endif + + TORCH_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got ", + "kH: ", kH, " kW: ", kW); + TORCH_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got " + "dH: ", dH, " dW: ", dW); + TORCH_CHECK(dilationH > 0 && dilationW > 0, + "dilation should be greater than zero, but got ", + "dilationH: ", dilationH, " dilationW: ", dilationW); + + bool valid_dims = input.size(1) != 0 && input.size(2) != 0; + if (memory_format == at::MemoryFormat::ChannelsLast){ + // Expect tensor in NHWC format and allow 0-dim only for N. + TORCH_CHECK((ndim == 4 && valid_dims && input.size(3) != 0), + "Expected 4D (batch mode) tensor expected for input with channels_last layout" + " with optional 0 dim batch size for input, but got: ", input.sizes()); + } else { + TORCH_CHECK((ndim == 3 && input.size(0) != 0 && valid_dims) || + (ndim == 4 && valid_dims && input.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got:", + input.sizes()); + } + + TORCH_CHECK(kW/2 >= padW && kH/2 >= padH, + "pad should be smaller than or equal to half of kernel size, but got ", + "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH); + + TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1, + "Given input size: (", + nInputPlane, "x", inputHeight, "x", inputWidth, "). ", + "Calculated output size: (", + nOutputPlane, "x", outputHeight, "x", outputWidth, "). ", + "Output size is too small"); +} + +// DilatedMaxPool2d (backward) +inline void +max_pool2d_backward_shape_check( + const Tensor& input, + const Tensor& gradOutput, + const Tensor& indices, + int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, + int64_t nInputPlane, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth, MemoryFormat memory_format) +{ + pool2d_shape_check( + input, + kH, kW, dH, dW, padH, padW, dilationH, dilationW, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, memory_format); + + const int64_t ndim = input.ndimension(); + const int64_t nOutputPlane = nInputPlane; + + check_dim_size(gradOutput, ndim, ndim-3, nOutputPlane); + check_dim_size(gradOutput, ndim, ndim-2, outputHeight); + check_dim_size(gradOutput, ndim, ndim-1, outputWidth); + + check_dim_size(indices, ndim, ndim-3, nOutputPlane); + check_dim_size(indices, ndim, ndim-2, outputHeight); + check_dim_size(indices, ndim, ndim-1, outputWidth); + + if (ndim == 4) { + const int64_t batchSize = input.size(0); + check_dim_size(gradOutput, ndim, 0, batchSize); + check_dim_size(indices, ndim, 0, batchSize); + } +} + +// AveragePool2d (backward) +inline void +avg_pool2d_backward_shape_check( + const Tensor& input, + const Tensor& gradOutput, + int64_t /*nbatch*/, + int kH, int kW, int dH, int dW, int padH, int padW, + int64_t nInputPlane, + int64_t inputHeight, int64_t inputWidth, + int64_t outputHeight, int64_t outputWidth, + MemoryFormat memory_format) +{ + pool2d_shape_check( + input, + kH, kW, dH, dW, padH, padW, 1, 1, + nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, + memory_format); + + const int64_t ndim = input.ndimension(); + const int64_t nOutputPlane = nInputPlane; + + check_dim_size(gradOutput, ndim, ndim-3, nOutputPlane); + check_dim_size(gradOutput, ndim, ndim-2, outputHeight); + check_dim_size(gradOutput, ndim, ndim-1, outputWidth); +} + +// AveragePool3d/DilatedMaxPool3d (forward) +inline void +pool3d_shape_check( + const Tensor& input, + int64_t nslices, + int kT, int kH, int kW, + int dT, int dH, int dW, + int pT, int pH, int pW, + int dilationT, int dilationH, int dilationW, + int64_t itime, int64_t iheight, int64_t iwidth, + int64_t otime, int64_t oheight, int64_t owidth, + const char *fn_name, + bool check_input_size=false) +{ + const int64_t ndim = input.ndimension(); + + TORCH_CHECK(kT > 0 && kW > 0 && kH > 0, + "kernel size should be greater than zero, but got ", + "kT: ", kT, " kH: ", kH, " kW: ", kW); + TORCH_CHECK(dT > 0 && dW > 0 && dH > 0, + "stride should be greater than zero, but got ", + "dT: ", dT, " dH: ", dH, " dW: ", dW); + TORCH_CHECK(dilationT > 0 && dilationW > 0 && dilationH > 0, + "dilation should be greater than zero, but got ", + "dilationT: ", dilationT, " dilationH: ", dilationH, " dilationW: ", dilationW); + + TORCH_CHECK(ndim == 4 || ndim == 5, + fn_name, ": Expected 4D or 5D tensor for input, but got: ", input.sizes()); + + for (const auto i : c10::irange(ndim)) { + if (ndim == 5 && i == 0) { + // size of batch-dim can be 0. + continue; + } + TORCH_CHECK( + input.size(i) > 0, + fn_name, + ": Expected input's non-batch dimensions to have positive length," + " but input has a shape of ", + input.sizes(), + " and non-batch dimension ", + input.size(i), + " has length zero!") + } + + if (check_input_size) { // AveragePool3d + TORCH_CHECK(itime >= kT && iheight >= kH && iwidth >= kW, + "input image ", "(T: ", itime, " H: ", iheight, " W: ", iwidth, ") smaller than ", + "kernel size ", "(kT: ", kT, " kH: ", kH, " kW: ", kW, ")"); + } + + TORCH_CHECK(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, + "pad should be smaller than or equal to half of kernel size, but got " + "kT: ", kT, " kW: ", kW, " kH: ", kH, " padT: ", pT, " padW: ", pW, " padH: ", pH); + + TORCH_CHECK(otime >= 1 && owidth >= 1 && oheight >= 1, + "Given input size: (", + nslices,"x", itime, "x", iheight, "x", iwidth, "). ", + "Calculated output size: (", + nslices, "x", otime, "x", oheight, "x", owidth, "). ", + "Output size is too small"); +} + +inline void +max_pool3d_backward_shape_check( + const Tensor& input, + const Tensor& gradOutput, + const Tensor& indices, + int64_t nslices, + int kT, int kH, int kW, + int dT, int dH, int dW, + int pT, int pH, int pW, + int dilationT, int dilationH, int dilationW, + int64_t itime, int64_t iheight, int64_t iwidth, + int64_t otime, int64_t oheight, int64_t owidth, + const char* fn_name) +{ + const int64_t ndim = input.ndimension(); + + pool3d_shape_check( + input, + nslices, + kT, kH, kW, + dT, dH, dW, + pT, pH, pW, + dilationT, dilationH, dilationW, + itime, iheight, iwidth, + otime, oheight, owidth, fn_name); + + check_dim_size(gradOutput, ndim, ndim-4, nslices); + check_dim_size(gradOutput, ndim, ndim-3, otime); + check_dim_size(gradOutput, ndim, ndim-2, oheight); + check_dim_size(gradOutput, ndim, ndim-1, owidth); + + check_dim_size(indices, ndim, ndim-4, nslices); + check_dim_size(indices, ndim, ndim-3, otime); + check_dim_size(indices, ndim, ndim-2, oheight); + check_dim_size(indices, ndim, ndim-1, owidth); +} + +inline void +avg_pool3d_backward_shape_check( + const Tensor& input, + const Tensor& gradOutput, + int64_t nslices, + int kT, int kH, int kW, + int dT, int dH, int dW, + int pT, int pH, int pW, + int64_t itime, int64_t iheight, int64_t iwidth, + int64_t otime, int64_t oheight, int64_t owidth, + const char *fn_name) +{ + const int64_t ndim = input.ndimension(); + + pool3d_shape_check( + input, + nslices, + kT, kH, kW, + dT, dH, dW, + pT, pH, pW, + 1, 1, 1, + itime, iheight, iwidth, + otime, oheight, owidth, + fn_name, true); + + check_dim_size(gradOutput, ndim, ndim-4, nslices); + check_dim_size(gradOutput, ndim, ndim-3, otime); + check_dim_size(gradOutput, ndim, ndim-2, oheight); + check_dim_size(gradOutput, ndim, ndim-1, owidth); +} + +} // anonymous namespace + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Pow.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Pow.h new file mode 100644 index 0000000000000000000000000000000000000000..0d34f5638fe8f3a56f3e8e6df246ce6f691ed00b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Pow.h @@ -0,0 +1,74 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10 { +class Scalar; +} + +namespace at { + +struct TensorIterator; +struct TensorIteratorBase; + +namespace native { + +#if defined(__CUDACC__) || defined(__HIPCC__) +#define HOST_DEVICE __host__ __device__ +#else +#define HOST_DEVICE +#endif + +// integral power in pytorch allows for negative exponents, giving truncated integral results. +// e.g. since 2**-1==0.5, the truncated integral result is zero. 1**negative_exponent is the +// only non-zero result. +template , T>* = nullptr> +inline HOST_DEVICE __ubsan_ignore_signed_int_overflow__ T powi_impl(T a, T b) { + T result = 1; + while (b) { + if (b & 1) { + result *= a; + } + b /= 2; + a *= a; + } + return result; +} + +template && !std::is_signed_v, T>* = nullptr> +inline HOST_DEVICE T powi(T a, T b) { + return powi_impl(a, b); +} + +template && std::is_signed_v, T>* = nullptr> +inline HOST_DEVICE T powi(T a, T b) { + if ( b < 0 ) { + if ( a == 1 ) { + return 1; + } else if ( a == -1 ) { + auto negative = (-b) % static_cast(2); + return negative ? -1 : 1; + } else { + return 0; + } + } + return powi_impl(a, b); +} + +using pow_tensor_tensor_fn = void (*)(TensorIteratorBase&); +using pow_tensor_scalar_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); + +DECLARE_DISPATCH(pow_tensor_tensor_fn, pow_tensor_tensor_stub) +DECLARE_DISPATCH(pow_tensor_scalar_fn, pow_tensor_scalar_stub) + +} // namespace native + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/RNN.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/RNN.h new file mode 100644 index 0000000000000000000000000000000000000000..f8cb3f41ed2016e2755b3734677dd2543259b5a5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/RNN.h @@ -0,0 +1,58 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +using lstm_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool, bool); +using rnn_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool, bool); +using lstm_packed_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, const Tensor&, TensorList, TensorList, bool, int64_t, double, bool, bool); +using rnn_packed_fn = void(*)(Tensor&, Tensor&, const Tensor&, const Tensor&, const Tensor&, TensorList, bool, int64_t, double, bool, bool); + +DECLARE_DISPATCH(lstm_fn, lstm_cudnn_stub) +DECLARE_DISPATCH(lstm_fn, lstm_miopen_stub) +DECLARE_DISPATCH(lstm_fn, lstm_mkldnn_stub) +DECLARE_DISPATCH(rnn_fn, gru_cudnn_stub) +DECLARE_DISPATCH(rnn_fn, gru_miopen_stub) +DECLARE_DISPATCH(rnn_fn, rnn_tanh_cudnn_stub) +DECLARE_DISPATCH(rnn_fn, rnn_tanh_miopen_stub) +DECLARE_DISPATCH(rnn_fn, rnn_relu_cudnn_stub) +DECLARE_DISPATCH(rnn_fn, rnn_relu_miopen_stub) +DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_cudnn_stub) +DECLARE_DISPATCH(lstm_packed_fn, lstm_packed_miopen_stub) +DECLARE_DISPATCH(rnn_packed_fn, gru_packed_cudnn_stub) +DECLARE_DISPATCH(rnn_packed_fn, gru_packed_miopen_stub) +DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_cudnn_stub) +DECLARE_DISPATCH(rnn_packed_fn, rnn_tanh_packed_miopen_stub) +DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_cudnn_stub) +DECLARE_DISPATCH(rnn_packed_fn, rnn_relu_packed_miopen_stub) + +inline void check_attributes(const Tensor& input, const TensorList& params, const TensorList& hiddens, bool check_dtype=false) { + auto input_device = input.device(); + auto input_dtype = input.scalar_type(); + + auto check_tensors = [&](const std::string& name, const Tensor& t) { + if (!t.defined()) return; + auto t_device = t.device(); + TORCH_CHECK(input_device == t_device, + "Input and ", name, " tensors are not at the same device, found input tensor at ", + input_device, " and ", name, " tensor at ", t_device); + if (check_dtype) { + auto t_dtype = t.scalar_type(); + TORCH_CHECK(input_dtype == t_dtype, + "Input and ", name, " tensors are not the same dtype, found input tensor with ", + input_dtype, " and ", name, " tensor with ", t_dtype); + } + }; + + for (const auto& h : hiddens) check_tensors("hidden", h); + for (const auto& p : params) check_tensors("parameter", p); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/RangeFactories.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/RangeFactories.h new file mode 100644 index 0000000000000000000000000000000000000000..cf12e97e4f8933320a058b7e0d6b31cbba867c2a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/RangeFactories.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include + +namespace at { +struct TensorIterator; + +namespace native { + +DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&, const Scalar&, const Scalar&), arange_stub) +DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&, const Scalar&, int64_t), linspace_stub) + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/RangeUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/RangeUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..5b5e724e6b34b42fe3ab10cf685c93d64a31a768 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/RangeUtils.h @@ -0,0 +1,65 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include + + + +namespace at::native { + +inline void arange_check_bounds( + const c10::Scalar& start, + const c10::Scalar& end, + const c10::Scalar& step) { + // use double precision for validation to avoid precision issues + double dstart = start.to(); + double dend = end.to(); + double dstep = step.to(); + + TORCH_CHECK(dstep > 0 || dstep < 0, "step must be nonzero"); + TORCH_CHECK( + std::isfinite(dstart) && std::isfinite(dend), + "unsupported range: ", + dstart, + " -> ", + dend); + TORCH_CHECK( + ((dstep > 0) && (dend >= dstart)) || ((dstep < 0) && (dend <= dstart)), + "upper bound and lower bound inconsistent with step sign"); +} + +template +int64_t compute_arange_size(const Scalar& start, const Scalar& end, const Scalar& step) { + arange_check_bounds(start, end, step); + + // we use double precision for (start - end) / step + // to compute size_d for consistency across devices. + // The problem with using accscalar_t is that accscalar_t might be float32 on gpu for a float32 scalar_t, + // but double on cpu for the same, + // and the effective output size starts differing on CPU vs GPU because of precision issues, which + // we dont want. + // the corner-case we do want to take into account is int64_t, which has higher precision than double + double size_d; + if constexpr (std::is_same_v) { + using accscalar_t = at::acc_type; + auto xstart = start.to(); + auto xend = end.to(); + auto xstep = step.to(); + int64_t sgn = (xstep > 0) - (xstep < 0); + size_d = std::ceil((xend - xstart + xstep - sgn) / xstep); + } else { + size_d = std::ceil((end.to() - start.to()) + / step.to()); + } + + TORCH_CHECK(size_d >= 0 && size_d <= static_cast(std::numeric_limits::max()), + "invalid size, possible overflow?"); + + return static_cast(size_d); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReduceAllOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReduceAllOps.h new file mode 100644 index 0000000000000000000000000000000000000000..9c07a586c621fa9a1840ff25705d8f69fb7cf44f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReduceAllOps.h @@ -0,0 +1,21 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at { +class Tensor; +} + +namespace at::native { + +using reduce_all_fn = void (*)(Tensor & result, const Tensor & self); +using reduce_min_max_fn = void (*)(Tensor & max_result, Tensor & min_result, const Tensor & self); +DECLARE_DISPATCH(reduce_all_fn, min_all_stub) +DECLARE_DISPATCH(reduce_all_fn, max_all_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReduceOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReduceOps.h new file mode 100644 index 0000000000000000000000000000000000000000..0b5fe2800e2da9b761de750e4aaab8855903566a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReduceOps.h @@ -0,0 +1,62 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace c10 { +class Scalar; +} + +namespace at { +struct TensorIterator; +class Tensor; +} + +namespace at::native { + +using reduce_fn = void(*)(TensorIterator &); + +DECLARE_DISPATCH(reduce_fn, sum_stub) +DECLARE_DISPATCH(reduce_fn, nansum_stub) +DECLARE_DISPATCH(reduce_fn, prod_stub) +DECLARE_DISPATCH(reduce_fn, mean_stub) +DECLARE_DISPATCH(reduce_fn, and_stub) +DECLARE_DISPATCH(reduce_fn, or_stub) +DECLARE_DISPATCH(reduce_fn, min_values_stub) +DECLARE_DISPATCH(reduce_fn, max_values_stub) +DECLARE_DISPATCH(reduce_fn, argmax_stub) +DECLARE_DISPATCH(reduce_fn, argmin_stub) +DECLARE_DISPATCH(reduce_fn, xor_sum_stub) + +using reduce_std_var_function = + void (*)(TensorIterator&, double correction, bool take_sqrt); +DECLARE_DISPATCH(reduce_std_var_function, std_var_stub) + +using reduce_norm_fn = + void (*)(Tensor&, const Tensor&, const c10::Scalar&, std::optional); +DECLARE_DISPATCH(reduce_norm_fn, norm_kernel) + +using reduce_fn_flag = void(*)(TensorIterator &, const c10::Scalar&); +DECLARE_DISPATCH(reduce_fn_flag, norm_stub) + +using structured_cum_fn = void (*)(const Tensor&, const Tensor&, int64_t); +using cum_fn = void (*)(Tensor&, const Tensor&, int64_t); +DECLARE_DISPATCH(structured_cum_fn, cumsum_stub) +DECLARE_DISPATCH(structured_cum_fn, cumprod_stub) +DECLARE_DISPATCH(cum_fn, logcumsumexp_stub) + +DECLARE_DISPATCH(void (*)(const Tensor&, int64_t, bool, Tensor&, Tensor&), aminmax_stub) +DECLARE_DISPATCH(void (*)(const Tensor&, Tensor&, Tensor&), aminmax_allreduce_stub) + +// Used in cuda/Normalization.cu +TORCH_API std::tuple var_mean_out( + Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, + int64_t correction, bool keepdim); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReduceOpsUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReduceOpsUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..8dff19ab88c6cf2b203212a8087028af42756f58 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReduceOpsUtils.h @@ -0,0 +1,473 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + +namespace at::native { + +// Maximum and minimum possible scalar values, including infinities +template +constexpr scalar_t upper_bound() { + using lim = std::numeric_limits; + return lim::has_infinity ? lim::infinity() : lim::max(); +} + +template +constexpr scalar_t lower_bound() { + using lim = std::numeric_limits; + return lim::has_infinity ? -lim::infinity() : lim::lowest(); +} + +inline Tensor restride_dim( + const Tensor& src, int64_t dim, + IntArrayRef replacement_shape +) { + auto strides = ensure_nonempty_vec(src.strides().vec()); + strides[dim] = 0; + return src.as_strided(replacement_shape, strides); +} + +inline void _dimreduce_setup(const Tensor &result, const Tensor &self, + int64_t dim) { + IntArrayRef self_sizes = self.sizes(); + std::vector result_sizes; + result_sizes.insert(result_sizes.end(), self_sizes.begin(), self_sizes.end()); + result_sizes[dim] = 1; + result.resize_(result_sizes); +} + +inline bool _dimreduce_return_trivial(const Tensor &result, const Tensor &self, + const Scalar& ident, int64_t dim, bool keepdim) { + if (self.numel() == 1 && self.ndimension() == 0) { + result.resize_({}); + result.fill_(self); + return true; + } + // Return identity + if (self.numel() == 0) { + _dimreduce_setup(result, self, dim); + result.fill_(ident); + if (!keepdim) result.squeeze_(dim); + return true; + } + return false; +} + +inline bool _dimreduce_return_trivial_no_ident(Tensor &result, const Tensor &self, + int64_t /*dim*/, bool /*keepdim*/, const char* /*fn_name*/) { + if (self.numel() == 1 && self.ndimension() == 0) { + result.resize_({}); + result.fill_(self); + return true; + } + + return false; +} + +inline std::optional _allreduce_return_trivial( + const Tensor& self, + const Scalar& ident) { + // Return identity + if (self.numel() == 0) { + return at::scalar_tensor(ident, self.options()); + } + return std::nullopt; +} + +#define OPTION_TYPE_EQUALITY_CHECK(option, out, self) \ +{ \ + TORCH_CHECK(\ + out.option() == self.option(),\ + "expected ", #option, " ",\ + self.option(),\ + " but found ", out.option())\ +} + +inline void check_scalar_type_device_layout_equal(const Tensor& out, const Tensor& self) { + OPTION_TYPE_EQUALITY_CHECK(scalar_type, out, self); + OPTION_TYPE_EQUALITY_CHECK(device, out.options(), self.options()); + OPTION_TYPE_EQUALITY_CHECK(layout, out.options(), self.options()); +} + +inline Tensor integer_upcast(const Tensor& self, std::optional dtype) { + ScalarType scalarType = self.scalar_type(); + TORCH_CHECK(!isBarebonesUnsignedType(scalarType), "integer upcasting for uint16, uint32 and uint64 is not currently implemented"); + ScalarType upcast_scalarType = dtype.value_or(at::isIntegralType(scalarType, /*includeBool=*/true) ? ScalarType::Long : scalarType); + return self.toType(upcast_scalarType); +} + +using DimMask = TensorIterator::DimMask; + +inline DimVector make_dim_vector(OptionalIntArrayRef opt_dims, int64_t ndim) { + if (opt_dims.has_value()) { + return DimVector(opt_dims.value()); + } else { + std::vector all_dims(ndim); + std::iota(all_dims.begin(), all_dims.end(), 0); + return DimVector(all_dims); + } +} + +inline DimMask make_dim_mask(OptionalIntArrayRef opt_dims, int64_t ndim, bool allow_empty_dims=false) { + DimMask mask; + if (opt_dims.has_value()) { + auto dims = opt_dims.value(); + if (dims.empty() && !allow_empty_dims) { + mask = DimMask().flip(); + } else { + mask = at::dim_list_to_bitset(dims, ndim); + } + } else { + mask = DimMask().flip(); + } + return mask; +} + +inline DimVector shape_from_dim_mask(const Tensor& self, DimMask mask, bool keepdim) { + auto shape = DimVector(self.sizes()); + for (int dim = shape.size() - 1; dim >= 0; dim--) { + if (mask[dim]) { + if (keepdim) { + shape[dim] = 1; + } else { + shape.erase(shape.begin() + dim); + } + } + } + return shape; +} + +inline void resize_reduction_result( + Tensor& result, const Tensor& self, DimMask mask, bool keepdim, + ScalarType /*dtype*/) +{ + auto shape = shape_from_dim_mask(self, mask, keepdim); + TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. You likely tried to call an operator with an out argument but the out argument was an undefined tensor."); + at::native::resize_output(result, shape); +} + +inline Tensor create_reduction_result( + const Tensor& self, at::OptionalIntArrayRef dim, bool keepdim, ScalarType dtype +) { + DimMask mask = make_dim_mask(dim, self.dim()); + auto shape = shape_from_dim_mask(self, mask, keepdim); + return at::empty(shape, self.options().dtype(dtype)); +} + +inline Tensor review_reduce_result(const Tensor& result, int ndim, DimMask mask, bool keepdim) { + if (keepdim) { + return result; + } + auto shape = DimVector(result.sizes()); + auto stride = DimVector(result.strides()); + for (const auto dim : c10::irange(ndim)) { + if (mask[dim]) { + shape.insert(shape.begin() + dim, 1); + stride.insert(stride.begin() + dim, 0); + } + } + return result.as_strided(shape, stride); +} + +inline TensorIterator make_reduction( + const char* name, Tensor& result, const Tensor& self, + at::OptionalIntArrayRef dim_opt, + bool keepdim, ScalarType in_dtype, ScalarType out_dtype) { + // check that result type and dtype match if provided + TORCH_CHECK( + !result.defined() || result.scalar_type() == out_dtype, + name, ": provided dtype must match dtype of result. Got ", + toString(result.scalar_type()), + " and ", + toString(out_dtype), + "."); + // dim={} performs an all-reduce, same as dim=None + IntArrayRef dim = dim_opt.value_or(IntArrayRef{}); + int64_t ndim = self.dim(); + auto mask = make_dim_mask(dim, ndim); + resize_reduction_result(result, self, mask, keepdim, out_dtype); + auto viewed_result = review_reduce_result(result, ndim, mask, keepdim); + namedinference::propagate_names_for_reduction(result, self, dim, keepdim); + if (self.scalar_type() == in_dtype) { + return TensorIterator::reduce_op(viewed_result, self); + } + return TensorIterator::reduce_op(viewed_result, self.to(in_dtype)); +} + +[[maybe_unused]] inline TensorIterator make_reduction( + const char* name, + Tensor& result, + const Tensor& self, + at::OptionalIntArrayRef dim, + bool keepdim, + ScalarType out_dtype) { + // special case for type promotion in mixed precision, improves computational + // efficiency. + // not generalize this to common mismatched input/output types to avoid cross + // product of templated kernel launches. + const bool gpu_lowp_to_f32 = ( + (self.is_cuda() || self.is_xpu()) && (self.scalar_type() == kHalf || self.scalar_type() == kBFloat16) && out_dtype == kFloat); + auto in_dtype = gpu_lowp_to_f32 ? self.scalar_type() + : self.is_complex() ? c10::toComplexType(out_dtype) + : out_dtype; + return make_reduction(name, result, self, dim, keepdim, in_dtype, out_dtype); +} + +inline TensorIterator make_reduction( + const char* name, Tensor& result1, Tensor& result2, const Tensor& self, + at::OptionalIntArrayRef dim_opt, bool keepdim, ScalarType dtype1, + ScalarType dtype2) { + // check that result type and dtype match if provided + TORCH_CHECK( + (!result1.defined() || result1.scalar_type() == dtype1) && (!result2.defined() || result2.scalar_type() == dtype2), + name, ": provided dtype must match dtype of result. Got ", + toString(result1.scalar_type()), toString(result2.scalar_type()), + " and ", + toString(dtype1), toString(dtype2), + "."); + + // dim={} performs an all-reduce, same as dim=None + auto dim = dim_opt.value_or(IntArrayRef{}); + int64_t ndim = self.dim(); + DimMask mask = make_dim_mask(dim, ndim); + resize_reduction_result(result1, self, mask, keepdim, dtype1); + auto viewed_result1 = review_reduce_result(result1, ndim, mask, keepdim); + + resize_reduction_result(result2, self, mask, keepdim, dtype2); + auto viewed_result2 = review_reduce_result(result2, ndim, mask, keepdim); + + namedinference::propagate_names_for_reduction(result1, self, dim, keepdim); + namedinference::propagate_names_for_reduction(result2, self, dim, keepdim); + + // special case for type promotion in mixed precision, improves computational + // efficiency. + // We don't generalize this to common mismatched input/output types to avoid cross + // product of templated kernel launches. + if (self.scalar_type() == dtype1 || + (self.is_cuda() && self.scalar_type() == kHalf && dtype1 == kFloat)) { + return TensorIterator::reduce_op(viewed_result1, viewed_result2, self); + } + return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1)); +} + +[[maybe_unused]] inline TensorIterator make_reduction( + const char* name, + Tensor& result1, + Tensor& result2, + const Tensor& self, + at::OptionalIntArrayRef dim, + bool keepdim, + ScalarType dtype) { + return make_reduction(name, result1, result2, self, dim, keepdim, dtype, dtype); +} + +inline void zero_numel_check_dims(const Tensor& self, const int64_t dim, const char *fn_name) { + if (self.ndimension() == 0) { + TORCH_CHECK_INDEX(dim == 0 || dim == -1, fn_name, + ": Expected reduction dim -1 or 0 for scalar but got ", dim); + } + else { + TORCH_CHECK_INDEX(self.size(dim) != 0, fn_name, + ": Expected reduction dim ", dim, " to have non-zero size."); + } +} + +inline void zero_numel_check_dims(const Tensor& self, const IntArrayRef dim, const char *fn_name) { + TORCH_CHECK( + !dim.empty(), + fn_name, ": Expected reduction dim to be specified for input.numel() == 0. ", + "Specify the reduction dim with the 'dim' argument."); + for (const int64_t d : dim) { + zero_numel_check_dims(self, d, fn_name); + } +} + +inline std::vector get_zero_numel_tensor_size( + const Tensor& self, + const int64_t dim, + const bool keepdim, + const char* fn_name) { + TORCH_INTERNAL_ASSERT(self.numel() == 0, fn_name, ": Expected self.numel() == 0."); + zero_numel_check_dims(self, dim, fn_name); + std::vector sizes; + if (keepdim) { + sizes = self.sizes().vec(); + sizes[dim] = 1; + } + else { + for (const auto d : c10::irange(self.dim())) { + if (d != dim) { + sizes.push_back(self.sizes()[d]); + } + } + } + return sizes; +} + +// Resize the result tensor and indices when result.numel() == 0 depending on values of +// dim and keepdim for returning tensors containing reduction results. +// This function should be called when you are reducing a zero-numel tensor and want to +// resize the output and return it. This function exists for resizing zero-numel +// tensors when the size of the reduction dimension is non-zero. +[[maybe_unused]] inline void zero_numel_tensor_resize( + Tensor& result, + Tensor& result_indices, + const Tensor& self, + const int64_t dim, + const bool keepdim, + const char* fn_name) { + auto sizes = get_zero_numel_tensor_size(self, dim, keepdim, fn_name); + at::native::resize_output(result, sizes); + at::native::resize_output(result_indices, sizes); +} + +inline ScalarType get_dtype_from_self( + const Tensor& self, + const std::optional& dtype, + bool promote_integers) { + if (dtype.has_value()) { + return dtype.value(); + } + ScalarType src_type = self.scalar_type(); + if (promote_integers && at::isIntegralType(src_type, /*includeBool=*/true)) { + return kLong; + } + return src_type; +} + +inline ScalarType get_dtype_from_result(Tensor& result, std::optional dtype) { + TORCH_CHECK(result.defined(), "Cannot create a new tensor inside a reduction op. You likely tried to call an operator with an out argument but the out argument was an undefined tensor."); + if (dtype.has_value()) { + return dtype.value(); + } else { + return result.scalar_type(); + } +} + + +} // namespace at::native + +namespace at::meta { + +[[maybe_unused]] inline DimVector get_reduction_shape( + const Tensor& self, + IntArrayRef dims, + bool keepdim, + bool allow_empty_dims = false) { + auto mask = native::make_dim_mask(dims, self.dim(), allow_empty_dims); + return native::shape_from_dim_mask(self, mask, keepdim); +} + +inline void resize_reduction( + impl::MetaBase& meta, + const Tensor& self, + OptionalIntArrayRef opt_dims, + bool keepdim, + ScalarType out_dtype, + bool allow_empty_dims=false) { + DimVector dims_ = at::native::make_dim_vector(opt_dims, self.dim()); + maybe_wrap_dims(dims_, self.dim()); + auto shape = get_reduction_shape(self, dims_, keepdim, allow_empty_dims); + if (self.layout() == kStrided) { + meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype)); + } else if (shape.empty()) { + meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype).layout(kStrided)); + } else { + TORCH_CHECK(false, "resize_reduction: support for output with ", self.layout(), " layout is not implemented yet"); + } + namedinference::propagate_names_for_reduction( + meta.maybe_get_output(), self, dims_, keepdim); +} + +inline void resize_reduction_with_indices( + impl::MetaBase& meta, + const Tensor& self, + IntArrayRef dims, + bool keepdim, + ScalarType out_dtype) { + DimVector dims_(dims); + maybe_wrap_dims(dims_, self.dim()); + auto shape = get_reduction_shape(self, dims_, keepdim); + meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype)); + meta.set_output_raw_strided(1, shape, {}, self.options().dtype(kLong)); + namedinference::propagate_names_for_reduction( + meta.maybe_get_output(0), self, dims_, keepdim); + namedinference::propagate_names_for_reduction( + meta.maybe_get_output(1), self, dims_, keepdim); +} + +inline TensorIterator make_reduction( + const Tensor& self, + const Tensor& result, + OptionalIntArrayRef opt_dims, + bool keepdim, + ScalarType in_dtype) { + int64_t ndim = self.dim(); + auto mask = at::native::make_dim_mask(opt_dims, ndim); + auto viewed_result = + at::native::review_reduce_result(result, ndim, mask, keepdim); + if (self.scalar_type() == in_dtype) { + return TensorIterator::reduce_op(viewed_result, self); + } + return TensorIterator::reduce_op(viewed_result, self.to(in_dtype)); +} + +inline TensorIterator make_reduction( + const Tensor& self, + const Tensor& result1, + const Tensor& result2, + IntArrayRef dims, + bool keepdim, + ScalarType dtype1, + ScalarType /*dtype2*/) { + int64_t ndim = self.dim(); + auto mask = at::native::make_dim_mask(dims, ndim); + auto viewed_result1 = at::native::review_reduce_result(result1, ndim, mask, keepdim); + auto viewed_result2 = at::native::review_reduce_result(result2, ndim, mask, keepdim); + // special case for type promotion in mixed precision, improves computational efficiency. + // We don't generalize this to common mismatched input/output types to avoid cross product + // of templated kernel launches. + if (self.scalar_type() == dtype1 || + (self.is_cuda() && self.scalar_type() == kHalf && dtype1 == kFloat)) { + return TensorIterator::reduce_op(viewed_result1, viewed_result2, self); + } + return TensorIterator::reduce_op(viewed_result1, viewed_result2, self.to(dtype1)); +} + +[[maybe_unused]] inline TensorIterator make_reduction_from_out_ty( + const Tensor& self, + const Tensor& result, + OptionalIntArrayRef opt_dims, + bool keepdim, + ScalarType out_dtype) { + // special case for type promotion in mixed precision, improves computational + // efficiency. + // not generalize this to common mismatched input/output types to avoid cross + // product of templated kernel launches. + const bool gpu_lowp_to_f32 = + (self.is_cuda() && + (self.scalar_type() == kHalf || self.scalar_type() == kBFloat16) && + out_dtype == kFloat); + auto in_dtype = gpu_lowp_to_f32 ? self.scalar_type() : out_dtype; + return make_reduction(self, result, opt_dims, keepdim, in_dtype); +} + +} // namespace at::meta + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReductionType.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReductionType.h new file mode 100644 index 0000000000000000000000000000000000000000..31f6dffa592c60dc8bd07ecc85dc5660c29e0bc6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ReductionType.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +enum class ReductionType {MAX, MEAN, MIN, SUM, PROD}; + +inline ReductionType get_reduction_enum(const std::string_view& reduce) { + if (reduce == "max" || reduce == "amax") { + return ReductionType::MAX; + } else if (reduce == "mean") { + return ReductionType::MEAN; + } else if (reduce == "min" || reduce == "amin") { + return ReductionType::MIN; + } else if (reduce == "sum") { + return ReductionType::SUM; + } else if (reduce == "prod") { + return ReductionType::PROD; + } else { + TORCH_CHECK(false, "reduce argument must be either sum, prod, mean, amax or amin, got ", reduce); + } +} + +// used for `scatter_reduce`, old options for BC. +inline ReductionType get_operator_enum(const std::string_view reduce, bool use_new_options) { + if (use_new_options) { + return get_reduction_enum(reduce); + } else { + if (reduce == "add") { + return ReductionType::SUM; + } else if (reduce == "multiply") { + return ReductionType::PROD; + } else { + TORCH_CHECK(false, "reduce argument must be either add or multiply.") + } + } +} + +} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Repeat.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Repeat.h new file mode 100644 index 0000000000000000000000000000000000000000..1f4f80a76d0a1b93be14296a15ec63b7ce2d5a91 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Repeat.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + +namespace at::native { + +template < + typename index_t, + void compute(const index_t*, const int64_t*, index_t*, int64_t, int64_t)> +static inline Tensor repeat_interleave_common( + const Tensor& repeats, + std::optional output_size) { + TORCH_CHECK( + repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); + TORCH_CHECK( + repeats.scalar_type() == at::kLong || repeats.scalar_type() == at::kInt, + "repeats has to be Long or Int tensor"); + if (repeats.size(0) == 0) { + return at::empty_like(repeats, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + Tensor repeats_ = repeats.contiguous(); + Tensor cumsum = repeats.cumsum(0); + int64_t total = 0; + if (output_size.has_value()) { + total = output_size.value(); + } else { + total = cumsum[-1].item(); + TORCH_CHECK( + (repeats >= 0).all().item(), "repeats can not be negative"); + } + + Tensor result = at::empty({total}, repeats.options()); + const index_t* repeat_ptr = repeats_.const_data_ptr(); + const int64_t* cumsum_ptr = cumsum.const_data_ptr(); + index_t* result_ptr = result.data_ptr(); + compute(repeat_ptr, cumsum_ptr, result_ptr, repeats.size(0), total); + return result; +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Resize.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Resize.h new file mode 100644 index 0000000000000000000000000000000000000000..d3af99da67372aec3597b5dda7c5c0df14545884 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Resize.h @@ -0,0 +1,210 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include + + +namespace at::native { + +// TODO: make all operations that resize given outputs use this function +// for consistency and maintainability. +// Some operations like `cat` might not be able to make the use of +// resize_output directly. For more details to understand how it works in `cat`, +// see https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362 +// Resizes outputs +// Functions accepting output tensors, like with the "out" kwarg, should +// call this function to handle resizing their output tensor. +// Issues a warning if the output tensor has one or more elements and +// needs resizing +// NOTE: In the future the warning will become an error +// Returns a bool saying whether or not the resize actually happened or not +TORCH_API bool resize_output(const Tensor& output, IntArrayRef shape); +// WARNING: Do NOT call this directly. If you are resizing an output and want +// to support dynamic shapes call at::resize__symint and resize_output_check_symint. +// For more details, see: https://github.com/pytorch/pytorch/pull/111530/files#r1365845272 +TORCH_API bool resize_output_symint(const Tensor& output, SymIntArrayRef shape); + +// Utility for resize_output +// Returns a bool saying resize should happen or not and +// raises a warning if resizing for one or more elements +TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape); +TORCH_API bool resize_output_check_symint(const Tensor& output, SymIntArrayRef shape); + +TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes); +TORCH_API void resize_bytes_meta(StorageImpl* storage, c10::SymInt size_bytes); +TORCH_API void resize_bytes_nocuda(const Storage& storage, const c10::SymInt& size_bytes); + +inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) { + // It does not make sense to try to resize a storage + // to hold 0 elements, and this can break + // if storage_offset is positive but + // new_size is 0, so just bail in that case + // (same comment is in cuda/Resize.h) + if (self->numel() == 0) { + return; + } + + const Storage& storage = self->unsafe_storage(); + if (!storage) { + auto new_storage = c10::make_intrusive( + StorageImpl::use_byte_size_t(), + new_size_bytes, + c10::GetCPUAllocator(), + true); + self->set_storage_keep_dtype(std::move(new_storage)); + } else if (new_size_bytes > storage.nbytes()) { + resize_bytes_cpu(storage.unsafeGetStorageImpl(), new_size_bytes); + } +} + +TORCH_API TensorImpl* resize_impl_cpu_( + TensorImpl* self, + IntArrayRef size, + at::OptionalIntArrayRef stride, + bool resize_storage = true); + +template +T maybe_convert_symint(c10::SymInt) = delete; + +template <> +inline c10::SymInt maybe_convert_symint(c10::SymInt x) { return x; } + +template <> +inline int64_t maybe_convert_symint(c10::SymInt x) { return x.guard_int(__FILE__, __LINE__); } + +template +inline void checkInBoundsForStorage( + ArrayRef size, + ArrayRef stride, + T storage_offset, + const caffe2::TypeMeta& data_type, + const Storage& new_storage) { + T storage_size_bytes, storage_size_plus_offset_bytes; + if (stride.data()) { + storage_size_bytes = + at::detail::computeStorageNbytes(size, stride, data_type.itemsize()); + storage_size_plus_offset_bytes = at::detail::computeStorageNbytes( + size, stride, data_type.itemsize(), storage_offset); + } else { + storage_size_bytes = + at::detail::computeStorageNbytesContiguous(size, data_type.itemsize()); + storage_size_plus_offset_bytes = at::detail::computeStorageNbytesContiguous( + size, data_type.itemsize(), storage_offset); + } + // It's ok to always evaluate to False for this early return for SymInts because + // (1) maybe_convert_symint below only installs guard for int64_t case + // (2) we check for this condition in the TORCH_MAYBE_SYM_CHECK below + if (TORCH_GUARD_OR_FALSE(sym_eq(storage_size_bytes, 0))) { + // NB: (a tensor with arbitrary 0 dims)'s storage can have any numel. + return; + } + T new_storage_size_bytes = maybe_convert_symint(new_storage.sym_nbytes()); + TORCH_MAYBE_SYM_CHECK( + sym_eq(storage_size_bytes, 0) || sym_le(storage_size_plus_offset_bytes, new_storage_size_bytes), + "setStorage: sizes ", + size, + ", strides ", + stride, + "," + " storage offset ", + storage_offset, + ", and itemsize ", + data_type.itemsize(), + " requiring a storage size of ", + storage_size_plus_offset_bytes, + " are out of bounds for storage of size ", + new_storage_size_bytes); +} + +template +inline void checkSetStorage(Tensor& result, Storage storage, T storage_offset, + ArrayRef size, ArrayRef stride, bool check_offset_in_bounds = true) { + // FIXME: stride should be optional + if (stride.data()) { + TORCH_CHECK(size.size() == stride.size(), "unequal size length (", size.size(), + ") and stride length (", stride.size(), ")"); + } + +#ifdef DEBUG + TORCH_CHECK(size.size() <= INT_MAX, "size length (", size.size(), ") greater than INT_MAX"); +#endif + + // storageOffset + TORCH_CHECK( + TORCH_GUARD_OR_TRUE(sym_ge(storage_offset, 0)), "Tensor: invalid storage offset ", storage_offset); + + // set_storage_{device} (except set_storage_meta__symint) + // will (unsafely) set the storage offset and then call resize_impl that + // handles resizing the storage However, resize_impl will only resize the + // storage if the sizes/strides changed. For the case that the sizes/strides + // remain unchanged, the storage offset is not properly validated, so we do + // that here. + if (check_offset_in_bounds) { + auto result_tensor_impl = result.unsafeGetTensorImpl(); + bool size_unchanged = result_tensor_impl->generic_sizes() == size; + bool stride_unchanged = stride.data() + ? result_tensor_impl->generic_strides() == stride + : true; + if (size_unchanged && stride_unchanged) { + checkInBoundsForStorage( + size, stride, storage_offset, result.dtype(), storage); + } + } + + // storage: note this can't be replaced with result.set_(storage) as the semantics of that + // function is to set the tensor size to be equal to the size of the storage. + if (!result.storage().is_alias_of(storage)) { + // Caffe2 might have tensors whose storages are null, but we + // don't allow it in PyTorch. + TORCH_INTERNAL_ASSERT(storage); + TORCH_INTERNAL_ASSERT(result.storage()); + + // We used to allow this, but this breaks device caching. + // Let's put an actual error message for this one. + TORCH_CHECK(result.storage().device() == storage.device(), + "Attempted to set the storage of a tensor on device \"", result.storage().device(), + "\" to a storage on different device \"", storage.device(), + "\". This is no longer allowed; the devices must match."); + result.unsafeGetTensorImpl()->set_storage_keep_dtype(std::move(storage)); + } +} + +/** + * Set self's sizes, strides, and storage_offset. + * (size, stride, storage_offset) must be in bounds for self's storage. + */ +template +inline void setStrided( + const Tensor& self, + ArrayRef size, + ArrayRef stride, + T storage_offset) { + TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape"); + for (const auto& val : stride) { + TORCH_CHECK(val >= 0, + "as_strided: Negative strides are not supported at the moment, " + "got strides: ", stride); + } + + auto* self_ = self.unsafeGetTensorImpl(); + checkInBoundsForStorage( + size, stride, storage_offset, self_->dtype(), self_->storage()); + + /* storage offset */ + TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset); + self_->set_sizes_and_strides(size, stride, storage_offset); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ResizeCommon.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ResizeCommon.h new file mode 100644 index 0000000000000000000000000000000000000000..ccedbed59e28b2bb2774b8a06ec490e903b36ce7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ResizeCommon.h @@ -0,0 +1,80 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +template +inline T storage_size_for(ArrayRef size, ArrayRef stride) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(size.size() == stride.size(), + "storage_size_for(size, stride) requires that size and stride ", + "have the same size as a precondition."); + T storage_size = 1; + for (const auto dim : c10::irange(size.size())) { + if (size[dim] == 0) { + storage_size = 0; + break; + } + storage_size += (size[dim] - 1) * stride[dim]; + } + return storage_size; +} + +inline const Tensor& resize_named_tensor_( + const Tensor& self, + IntArrayRef size, + std::optional optional_memory_format) { + TORCH_INTERNAL_ASSERT(self.has_names()); + TORCH_CHECK( + self.sizes() == size, + "Cannot resize named tensor with resize_ or resize_as_ (tried to resize " + "Tensor", + self.names(), + " with size ", + self.sizes(), + " to ", + size, + "). This may be caused by passing a named tensor ", + "as an `out=` argument; please ensure that the sizes are the same. "); + TORCH_CHECK( + !optional_memory_format.has_value(), + "Unsupported memory format for named tensor resize ", + optional_memory_format.value()); + return self; +} + +// For deterministic output, fill new elements that were added after a storage +// resize with NaN or MAX_INT. `old_storage_nbytes` is the size of the storage +// before the resize happened. +inline const Tensor& fill_resize_deterministic_(const Tensor& tensor, int64_t old_storage_nbytes) { + const at::Storage& storage = tensor.unsafeGetTensorImpl()->unsafe_storage(); + int64_t new_storage_nbytes = storage.nbytes(); + int64_t old_storage_numel = old_storage_nbytes / tensor.itemsize(); + int64_t new_storage_numel = new_storage_nbytes / tensor.itemsize(); + if (new_storage_numel > old_storage_numel) { + at::Tensor tensor_view = at::empty({}, at::TensorOptions().dtype(tensor.scalar_type()).device(tensor.device())); + tensor_view.set_( + storage, + /*storage_offset=*/old_storage_numel, + /*size=*/{new_storage_numel - old_storage_numel}, + /*stride=*/{1}); + at::native::fill_empty_deterministic_(tensor_view); + } + return tensor; +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ScatterGatherChecks.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ScatterGatherChecks.h new file mode 100644 index 0000000000000000000000000000000000000000..9d7e4c319a4e6500947c85143fd20fe66e7380ec --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/ScatterGatherChecks.h @@ -0,0 +1,133 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +namespace { + +// checks whether index.dtype == int64 +// and self.dtype == src.dtype if src is a Tensor +inline void scatter_gather_dtype_check( + const std::string& method_name, + const Tensor& self, + const Tensor& index, + const std::optional& src_opt = std::nullopt +) { + if (index.numel() != 0) { + TORCH_CHECK( + index.scalar_type() == at::ScalarType::Long || index.scalar_type() == at::ScalarType::Int, + method_name, "(): Expected dtype int32/int64 for index" + ); + } + + if (src_opt.has_value()) { + const auto& src = src_opt.value(); + TORCH_CHECK( + self.scalar_type() == src.scalar_type(), + method_name, "(): Expected self.dtype to be equal to src.dtype" + ); + } +} + +// Used for `gather`-like methods +// Note: self means the input tensor here +// Test: +// 1. index.size(d) <= self.size(d) for all d != dim +// 2. index.dim() == self.dim() +inline void gather_shape_check(const Tensor& self, int64_t dim, + const Tensor& index +) { + auto self_dims = ensure_nonempty_dim(self.dim()); + TORCH_CHECK(self_dims == ensure_nonempty_dim(index.dim()), + "Index tensor must have the same number of dimensions as input tensor" + ); + + for (const auto i : c10::irange(self_dims)) { + if (i != dim) { + TORCH_CHECK( + ensure_nonempty_size(index, i) <= ensure_nonempty_size(self, i), + "Size does not match at dimension ", i, + " expected index ", index.sizes(), + " to be no larger than self ", self.sizes(), + " apart from dimension ", dim + ); + } + } +} + +// Used for `scatter` and `scatter_add` +// Tests: +// 1. index.size(d) <= self.size(d) for all d != dim +// 2. index.size(d) <= src.size(d) for all d if src is a Tensor +// 3. index.dim() == self.dim() == src.dim() +inline void scatter_shape_check( + const Tensor& self, int64_t dim, const Tensor& index, + const std::optional& src_opt = std::nullopt +) { + if (index.numel() == 0) return; + TORCH_CHECK( + ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()), + "Index tensor must have the same number of dimensions as self tensor" + ); + + bool is_wrong_shape = false; + int64_t self_dims = ensure_nonempty_dim(self.dim()); + + // Check: index.size(d) <= self.size(d) for all d != dim + for (const auto d : c10::irange(self_dims)) { + int64_t index_d_size = ensure_nonempty_size(index, d); + if (d == dim) continue; + if (index_d_size > ensure_nonempty_size(self, d)) { + is_wrong_shape = true; + break; + } + } + + // Check: index.size(d) <= src.size(d) for all d if src is Tensor + if (!is_wrong_shape && src_opt.has_value()) { + const auto& src = src_opt.value(); + for (const auto d : c10::irange(self_dims)) { + int64_t index_d_size = ensure_nonempty_size(index, d); + if (index_d_size > ensure_nonempty_size(src, d)) { + is_wrong_shape = true; + break; + } + } + } + + if (src_opt.has_value()) { + const auto& src = src_opt.value(); + + TORCH_CHECK( + ensure_nonempty_dim(src.dim()) == ensure_nonempty_dim(index.dim()), + "Index tensor must have the same number of dimensions as src tensor" + ); + + TORCH_CHECK(!is_wrong_shape, + "Expected index ", index.sizes(), + " to be no larger than self ", self.sizes(), + " apart from dimension ", dim, + " and to be no larger size than src ", src.sizes() + ); + } + else { + TORCH_CHECK(!is_wrong_shape, + "Expected index ", index.sizes(), + " to be no larger than self ", self.sizes(), + " apart from dimension ", dim + ); + } +} + +} // anonymous namespace + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SegmentReduce.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SegmentReduce.h new file mode 100644 index 0000000000000000000000000000000000000000..add144d19e2a0c4872304c935a320d4a11b26f27 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SegmentReduce.h @@ -0,0 +1,55 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using segment_reduce_lengths_fn = Tensor (*)( + ReductionType, + const Tensor&, + const Tensor&, + int64_t, + const std::optional&); +DECLARE_DISPATCH(segment_reduce_lengths_fn, _segment_reduce_lengths_stub) + +using segment_reduce_offsets_fn = Tensor (*)( + ReductionType, + const Tensor&, + const Tensor&, + int64_t, + const std::optional&); +DECLARE_DISPATCH(segment_reduce_offsets_fn, _segment_reduce_offsets_stub) + +using segment_reduce_lengths_backward_fn = Tensor (*)( + const Tensor&, + const Tensor&, + const Tensor&, + ReductionType, + const Tensor&, + int64_t, + const std::optional&); +DECLARE_DISPATCH(segment_reduce_lengths_backward_fn, _segment_reduce_lengths_backward_stub) + +using segment_reduce_offsets_backward_fn = Tensor (*)( + const Tensor&, + const Tensor&, + const Tensor&, + ReductionType, + const Tensor&, + int64_t, + const std::optional&); +DECLARE_DISPATCH(segment_reduce_offsets_backward_fn, _segment_reduce_offsets_backward_stub) + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SharedReduceOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SharedReduceOps.h new file mode 100644 index 0000000000000000000000000000000000000000..56e4db1423183f5049a21d2008dd9c8c5f147fa0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SharedReduceOps.h @@ -0,0 +1,550 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// Please note that this file is +// used across both CPU and GPU. + +#include +#include +#include +#include +#include +#include +#if defined(__CUDACC__) +#include +#include +#elif defined(__HIPCC__) +#include +#include +#endif +#if defined(__CUDACC__) || defined(__HIPCC__) +#include +#else +#include +#define device_sqrt std::sqrt +#endif +#if defined(__CUDACC__) || defined(__HIPCC__) +template +inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) { +#if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm/hip/issues/2209 + scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); +#else + scalar_t max = at::_isnan(b) ? b : std::max(a, b); +#endif + return max; +} +template +inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) { +#if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm/hip/issues/2209 + scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); +#else + scalar_t min = at::_isnan(b) ? b : std::min(a, b); +#endif + return min; +} +#define MAX(X, Y) max_propagate_nan(X,Y) +#define MIN(X, Y) min_propagate_nan(X,Y) +#else +#include +#define MAX(X, Y) max_impl(X,Y) +#define MIN(X, Y) min_impl(X,Y) +#endif + +// ROCM hcc doesn't work well with using std:: in kernel functions +#if defined(__CUDA_ARCH__) +#include +#define compat_pow c10::cuda::compat::pow +#elif defined(__HIPCC__) +#include +#define compat_pow c10::hip::compat::pow +#else +#define compat_pow std::pow +#endif + +namespace at::native { + +namespace detail { + +#if defined(__CUDACC__) || defined(__HIPCC__) +template using pair = thrust::pair; +#else +template using pair = std::pair; +#endif + +} // namespace detail + +template +struct WelfordData { + scalar_t mean; + scalar_t m2; + index_t n; + scalar_t nf; + + C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {} + + C10_HOST_DEVICE WelfordData( + scalar_t mean, + scalar_t m2, + index_t n, + scalar_t nf) + : mean(mean), m2(m2), n(n), nf(nf) {} +}; + + +template +struct WelfordOps { + acc_scalar_t correction; + bool take_sqrt; + public: + using acc_t = WelfordData; + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const { + // We accumulate n in index_t to avoid cumulative rounding error, but still + // need nf for use in combine where int32 may overflow. + index_t new_n = acc.n + 1; + acc_scalar_t new_nf = static_cast(new_n); + acc_scalar_t delta = data - acc.mean; + acc_scalar_t new_mean = acc.mean + delta / new_nf; + acc_scalar_t new_delta = data - new_mean; + return { + new_mean, + acc.m2 + delta * new_delta, + new_n, + new_nf, + }; + } + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + if (a.nf == 0) { + return b; + } + if (b.nf == 0) { + return a; + } + acc_scalar_t delta = b.mean - a.mean; + acc_scalar_t new_count = a.nf + b.nf; + acc_scalar_t nb_over_n = b.nf / new_count; + return { + a.mean + delta * nb_over_n, + a.m2 + b.m2 + delta * delta * a.nf * nb_over_n, + // setting acc.n as -1 since acc.n might not be able to represent the count + // correctly within its range, setting it to -1 to avoid confusion + -1, + new_count + }; + } + inline C10_DEVICE res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ { + const auto mean = static_cast(acc.mean); + const auto divisor = acc.nf > correction ? acc.nf - correction : 0; + const auto var = acc.m2 / divisor; + res_t results(take_sqrt ? device_sqrt(var) : var, mean); + return results; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const { + return { + WARP_SHFL_DOWN(acc.mean, offset) + , WARP_SHFL_DOWN(acc.m2, offset) + , WARP_SHFL_DOWN(acc.n, offset) + , WARP_SHFL_DOWN(acc.nf, offset) + }; + } +#endif + C10_HOST_DEVICE WelfordOps(acc_scalar_t correction, bool take_sqrt) + : correction(correction), take_sqrt(take_sqrt) {} +}; + +template +struct MeanOps { + factor_t factor; + + inline C10_DEVICE acc_t reduce(acc_t a, scalar_t b, int64_t /*idx*/) const { + return combine(a, static_cast(b)); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a * factor; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif + + MeanOps(factor_t factor): factor(factor) { + } +}; + +// This accumulator template is used to calculate the minimum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct AbsMinOps { + + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MIN(acc, static_cast(std::abs(at::opmath_type(data)))); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return MIN(a, b); + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +// This accumulator template is used to calculate the maximum absolute value of +// a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct AbsMaxOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return MAX(acc, static_cast(std::abs(at::opmath_type(data)))); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return MAX(a, b); + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +// This accumulator template is used to calculate the norm of the absolute value +// of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormOps { + acc_t norm_; + + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + compat_pow(static_cast(std::abs(at::opmath_type(data))), norm_); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return compat_pow(a, static_cast(1.0) / norm_); + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif + + NormOps(acc_t norm_): norm_(norm_) { + } +}; + +// This accumulator template is used to calculate the order zero norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormZeroOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + (data == static_cast(0) ? static_cast(0) : static_cast(1)); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +// This accumulator template is used to calculate the order one norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormOneOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + return acc + static_cast(std::abs(at::opmath_type(data))); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + + +template +struct AbsSwitch {}; + +template +inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch /*unused*/) { + return static_cast(data); +} + +template +inline C10_DEVICE acc_t abs_if_complex(std::complex data, AbsSwitch /*unused*/) { + return static_cast(std::abs(data)); +} + +template +inline C10_DEVICE acc_t abs_if_complex(c10::complex data, AbsSwitch /*unused*/) { + return static_cast(std::abs(at::opmath_type>(data))); +} + +// This accumulator template is used to calculate the order two norm of the +// absolute value of a set of numbers. +// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated +// value. These types differ for complex number input support. +template +struct NormTwoOps { + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { + acc_t data_ = abs_if_complex(data, AbsSwitch()); + return acc + data_ * data_; + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE out_t project(acc_t a) const { + return device_sqrt(a); + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } +#endif +}; + +template +struct NanSumOps { + inline C10_DEVICE acc_t reduce(acc_t a, data_t b, int64_t /*idx*/) const { + return a + (at::_isnan(b) ? acc_t{0.} : acc_t{b}); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return a + b; + } + + inline C10_DEVICE data_t project(acc_t a) const { + return data_t{a}; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif +}; + +namespace detail { + +template +struct LessOrNan { + C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const { + // If (a == b), then choose the one with lower idx, else min(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a < b); + } +}; + +template +struct GreaterOrNan { + C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const { + // If (a == b), then choose the one with lower idx, else max(a, b) + if (at::_isnan(a)) { + if (at::_isnan(b)) { + return idx_a < idx_b; + } + return true; + } + return (a == b) ? idx_a < idx_b : (a > b); + } +}; + +template +struct MinMaxReductionOps { + using scalar_t = typename binary_function_traits::arg1_t; + using index_t = int64_t; + using arg_t = detail::pair; + + static C10_DEVICE arg_t project(arg_t arg) { + return arg; + } + + static C10_DEVICE arg_t reduce(arg_t arg, scalar_t val, int64_t idx) { + return comp_t{}(arg.first, val, arg.second, idx) ? arg : arg_t(val, idx); + } + + static C10_DEVICE arg_t combine(arg_t a, arg_t b) { + return comp_t{}(a.first, b.first, a.second, b.second) ? a : b; + } + + static C10_DEVICE arg_t translate_idx(arg_t a, int64_t base_idx) { + return {a.first, a.second + base_idx}; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + static C10_DEVICE arg_t warp_shfl_down(arg_t arg, int offset) { + return arg_t(WARP_SHFL_DOWN(arg.first, offset), + WARP_SHFL_DOWN(arg.second, offset)); + } +#endif +}; + +template +struct ArgReductionOps : public MinMaxReductionOps { + using typename MinMaxReductionOps::scalar_t; + using typename MinMaxReductionOps::index_t; + using typename MinMaxReductionOps::arg_t; + + static C10_DEVICE index_t project(arg_t arg) { + return arg.second; + } +}; + +} // namespace detail + +template +struct ArgMaxOps : + public detail::ArgReductionOps> { +}; + +template +struct ArgMinOps : + public detail::ArgReductionOps> { +}; + +template +struct MinOps : + public detail::MinMaxReductionOps> { +}; + +template +struct MaxOps : + public detail::MinMaxReductionOps> { +}; + +template +struct MinMaxOps { + using acc_t = detail::pair; + inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const { + return combine(acc, {data, data}); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + auto min_val = (at::_isnan(a.first) || a.first < b.first) ? a.first : b.first; + auto max_val = (at::_isnan(a.second) || a.second > b.second) ? a.second : b.second; + + return {min_val, max_val}; + } + + inline C10_DEVICE acc_t project(acc_t acc) const { + return acc; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { + return { + WARP_SHFL_DOWN(acc.first, offset), WARP_SHFL_DOWN(acc.second, offset) + }; + } +#endif +}; + +} // namespace at::native + +#undef MAX +#undef MIN + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..b13470713490a02311357b44b5094b7517674585 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h @@ -0,0 +1,60 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/// This file contains some tensor-agnostic operations to be used in the +/// core functions of the `SobolEngine` +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#endif + +namespace at::native::sobol_utils { + +/// Function to return the minimum of number of bits to represent the integer `n` +inline int64_t bit_length(const int64_t n) { + int64_t nbits, nloc; + for (nloc = n, nbits = 0; nloc > 0; nloc /= 2, nbits++); + return nbits; +} + +/// Function to get the position of the rightmost zero in the bit representation of an integer +/// This value is the zero-indexed position +inline int64_t rightmost_zero(const int64_t n) { + int64_t z, i; + for (z = n, i = 0; z % 2 == 1; z /= 2, i++); + return i; +} + +/// Function to get a subsequence of bits in the representation of an integer starting from +/// `pos` and of length `length` +inline int64_t bitsubseq(const int64_t n, const int64_t pos, const int64_t length) { + return (n >> pos) & ((1 << length) - 1); +} + +/// Function to perform the inner product between a batched square matrix and a power of 2 vector +inline at::Tensor cdot_pow2(const at::Tensor& bmat) { + at::Tensor inter = at::arange(bmat.size(-1) - 1, -1, -1, bmat.options()); + inter = at::pow(2, inter).expand_as(bmat); + return at::mul(inter, bmat).sum(-1); +} + +/// All definitions below this point are data. These are constant, and should not be modified +/// without notice + +constexpr int64_t MAXDIM = 21201; +constexpr int64_t MAXDEG = 18; +constexpr int64_t MAXBIT = 30; +constexpr int64_t LARGEST_NUMBER = 1 << MAXBIT; +constexpr float RECIPD = 1.0 / LARGEST_NUMBER; + +extern const int64_t poly[MAXDIM]; +extern const int64_t initsobolstate[MAXDIM][MAXDEG]; + +} // namespace at::native::sobol_utils + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Sorting.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Sorting.h new file mode 100644 index 0000000000000000000000000000000000000000..753430204919b2ea67c63f74fe2efdc6dc859538 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Sorting.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +enum class QUANTILE_INTERPOLATION_MODE : uint8_t { + LINEAR, + LOWER, + HIGHER, + MIDPOINT, + NEAREST +}; + +using sort_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, bool, bool); +using topk_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, int64_t, bool, bool); + +DECLARE_DISPATCH(sort_fn, sort_stub) +DECLARE_DISPATCH(topk_fn, topk_stub) + +void _fill_indices(const TensorBase &indices, int64_t dim); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SortingUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SortingUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..badeba5f6fbffd30e06eb09be18f2e3d16d8e6d5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SortingUtils.h @@ -0,0 +1,93 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +// ensure we get good values and indices for kthvalue, mode +// this will always be with the reducing dim as 1-d +inline void _reduction_with_indices_allocate_or_resize_output( + Tensor& values, + Tensor& indices, + const Tensor& self, + int64_t dim_, + bool keepdim) { + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + auto result_sizes = self.sizes().vec(); + if (!result_sizes.empty()) { + result_sizes[dim] = 1; + } + if (values.defined()) { + TORCH_CHECK( + self.options().type_equal(values.options()), + "output values must be of same type as input"); + if (!keepdim && values.dim() == self.dim() - 1) { + // unsqueeze to preserve passed in noncontiguous tensor in resize + values.unsqueeze_(dim); + } + resize_output(values, result_sizes); + } else { + values = at::empty(result_sizes, self.options()); + } + if (indices.defined()) { + TORCH_CHECK( + indices.dtype() == kLong, "output indices must be of scalar type Long"); + TORCH_CHECK( + indices.device() == self.device(), + "output indices must be on same device as input"); + if (!keepdim && indices.dim() == self.dim() - 1) { + // unsqueeze to preserve passed in noncontiguous tensor in resize + indices.unsqueeze_(dim); + } + resize_output(indices, result_sizes); + } else { + indices = at::empty(result_sizes, self.options().dtype(kLong)); + } +} + +// ensure we get good values and indices for topk +inline void _allocate_or_resize_output_with_indices( + Tensor& values, + Tensor& indices, + const Tensor& self, + int64_t dim_, + int64_t k) { + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + auto result_sizes = self.sizes().vec(); + if (!result_sizes.empty()) { + result_sizes[dim] = k; + } + if (values.defined()) { + TORCH_CHECK( + self.options().type_equal(values.options()), + "output values must be of same type as input"); + values.resize_(result_sizes); + } else { + values = at::empty(result_sizes, self.options()); + } + if (indices.defined()) { + TORCH_CHECK( + indices.dtype() == kLong, "output indices must be of scalar type Long"); + TORCH_CHECK( + indices.device() == self.device(), + "output indices must be on same device as input"); + indices.resize_(result_sizes); + } else { + indices = at::empty(result_sizes, self.options().dtype(kLong)); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SparseTensorUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SparseTensorUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..3916d49bf3f65de4e3c47f1d223eab7b809fdfe0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SparseTensorUtils.h @@ -0,0 +1,195 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + +namespace at::sparse { + +// Just for documentary purposes +using SparseTensor = Tensor; +using SparseType = Type; + +// This is an internal utility function for getting at the SparseTensorImpl, +// so that we can write sparse tensor specific accessors for special fields +// in SparseTensor. You should only use this for writing low level +// setters/getters for SparseTensorImpl fields; otherwise, you should use +// the low level setters/getters that were implemented using this. +// +// This may be called repeatedly, so make sure it's pretty cheap. +inline SparseTensorImpl* get_sparse_impl(const SparseTensor& self) { + TORCH_INTERNAL_ASSERT( + self.is_sparse(), "_internal_get_SparseTensorImpl: not a sparse tensor"); + return static_cast(self.unsafeGetTensorImpl()); +} + +// Takes indices and values and directly puts them into the sparse tensor, no +// copy. This used to be called THSTensor_(_move) +inline void alias_into_sparse( + const SparseTensor& self, + const Tensor& indices, + const Tensor& values) { + get_sparse_impl(self)->set_indices_and_values_unsafe(indices, values); +} + +// Take indices and values and makes a (data) copy of them to put into the +// sparse indices/values. This used to be called THSTensor_(_set) +inline void copy_into_sparse( + const SparseTensor& self, + const Tensor& indices, + const Tensor& values, + bool non_blocking) { + alias_into_sparse( + self, + indices.to(self._indices().options(), non_blocking, /*copy=*/true), + values.to(self._values().options(), non_blocking, /*copy=*/true)); +} + +// TODO: put this into the public API +inline bool is_same_tensor(const Tensor& lhs, const Tensor& rhs) { + return lhs.unsafeGetTensorImpl() == rhs.unsafeGetTensorImpl(); +} + +inline bool is_same_density(const SparseTensor& self, const SparseTensor& src) { + return self.sparse_dim() == src.sparse_dim() && + self.dense_dim() == src.dense_dim(); +} + +// Give us a new values tensor, with the same dimensionality +// as 'values' but with a new number of non-zero elements. +// TODO: Expose this for real in ATen, some day? +// NB: Doesn't preserve data. +inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) { + std::vector size = values.sizes().vec(); + size[0] = nnz; + return at::empty(size, values.options()); +} + +// NOTE [ Flatten Sparse Indices ] +// This helper function flattens a sparse indices tensor (a Tensor) into a 1D +// indices tensor. E.g., +// input = [[2, 4, 0], +// [3, 1, 10]] +// full_size = [2, 12] +// output = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 10 ] = [27, 49, 10] +// +// In other words, assuming that each `indices[i, :]` is a valid index to a +// tensor `t` of shape `full_size`. This returns the corresponding indices to +// the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`. +// if forceClone is true, the result will forced to be a clone of self. +// if force_clone is true, the result will forced to be a clone of self. +TORCH_API Tensor flatten_indices( + const Tensor& indices, + IntArrayRef full_size, + bool force_clone = false); + +// Flatten sparse tensor's indices from nD to 1D, similar to NOTE [ Flatten +// Sparse Indices ], except this one allows partial flatten: only flatten on +// specified dims. Note that the flatten indices might be uncoalesced if +// dims_to_flatten.size() < sparse_dim. Also if input indices is already +// coalesced, the flattened indices will also be sorted. +// +// args: +// indices: sparse tensor indices +// sizes: sparse tensor sizes +// dims_to_flatten: a list of dim index to flatten +// +// Ex1: +// indices = [[2, 4, 0], +// [3, 1, 3]] +// sizes = [2, 12] +// dims_to_flatten = [0, 1] +// new_indices = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 3 ] = [27, 49, 3] +// +// Ex2: +// dims_to_flatten = [1] +// new_indices = [ 3, 1, 3 ] # uncoalesced +TORCH_API Tensor flatten_indices_by_dims( + const Tensor& indices, + const IntArrayRef& sizes, + const IntArrayRef& dims_to_flatten); + +// Find the CSR representation for a row `indices` from the COO format +TORCH_API Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz); + +TORCH_API Tensor zeros_like_with_indices(const Tensor& t); + +template +class TensorGeometryHolder { + using geometry_holder_t = std::array; + + public: + explicit TensorGeometryHolder( + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options = {}) { + std::copy(sizes.begin(), sizes.end(), t_sizes.begin()); + std::copy(strides.begin(), strides.end(), t_strides.begin()); + } + + explicit TensorGeometryHolder(const Tensor& t) + : TensorGeometryHolder(t.sizes(), t.strides()) {} + + auto operator*() const { + return std::make_tuple(t_sizes, t_strides); + } + + private: + geometry_holder_t t_sizes; + geometry_holder_t t_strides; +}; + +template <> +class TensorGeometryHolder<0> { + using geometry_holder_t = Tensor; + + public: + explicit TensorGeometryHolder( + IntArrayRef sizes, + IntArrayRef strides, + TensorOptions options) { + const int64_t t_ndims = sizes.size(); + const auto cpu_options = TensorOptions(options).dtype(kLong).device(kCPU); + Tensor t_sizes_and_strides_cpu = at::empty({2, t_ndims}, cpu_options); + t_sizes_and_strides_cpu.select(0, 0).copy_(at::tensor(sizes, cpu_options)); + t_sizes_and_strides_cpu.select(0, 1).copy_( + at::tensor(strides, cpu_options)); + const Tensor t_sizes_and_strides = + t_sizes_and_strides_cpu.to(options.device()); + t_sizes = t_sizes_and_strides.select(0, 0); + t_strides = t_sizes_and_strides.select(0, 1); + } + + explicit TensorGeometryHolder(const Tensor& t) + : TensorGeometryHolder(t.sizes(), t.strides(), t.options()) {} + + auto operator*() const { + return std::make_tuple( + t_sizes.template data_ptr(), + t_strides.template data_ptr()); + } + + private: + geometry_holder_t t_sizes; + geometry_holder_t t_strides; +}; + +// Return all indices of a tensor with the given shape. +// +// full_coo_indices(shape) is equivalent to +// torch.ones(shape).nonzero().transpose(-2, -1) but much faster. +TORCH_API Tensor full_coo_indices(IntArrayRef sizes, TensorOptions options); + +} // namespace at::sparse + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SpectralOpsUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SpectralOpsUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..5d7666dc6577133a8bfc7bd03f9b2190e6ffc91d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/SpectralOpsUtils.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// Normalization types used in _fft_with_size +enum class fft_norm_mode { + none, // No normalization + by_root_n, // Divide by sqrt(signal_size) + by_n, // Divide by signal_size +}; + +// NOTE [ Fourier Transform Conjugate Symmetry ] +// +// Real-to-complex Fourier transform satisfies the conjugate symmetry. That is, +// assuming X is the transformed K-dimensional signal, we have +// +// X[i_1, ..., i_K] = X[j_i, ..., j_K]*, +// +// where j_k = (N_k - i_k) mod N_k, N_k being the signal size at dim k, +// * is the conjugate operator. +// +// Therefore, in such cases, FFT libraries return only roughly half of the +// values to avoid redundancy: +// +// X[:, :, ..., :floor(N / 2) + 1] +// +// This is also the assumption in cuFFT and MKL. In ATen SpectralOps, such +// halved signal will also be returned by default (flag onesided=True). +// The following infer_ft_real_to_complex_onesided_size function calculates the +// onesided size from the twosided size. +// +// Note that this loses some information about the size of signal at last +// dimension. E.g., both 11 and 10 maps to 6. Hence, the following +// infer_ft_complex_to_real_onesided_size function takes in optional parameter +// to infer the twosided size from given onesided size. +// +// cuFFT doc: http://docs.nvidia.com/cuda/cufft/index.html#multi-dimensional +// MKL doc: https://software.intel.com/en-us/mkl-developer-reference-c-dfti-complex-storage-dfti-real-storage-dfti-conjugate-even-storage#CONJUGATE_EVEN_STORAGE + +inline int64_t infer_ft_real_to_complex_onesided_size(int64_t real_size) { + return (real_size / 2) + 1; +} + +inline int64_t infer_ft_complex_to_real_onesided_size(int64_t complex_size, + int64_t expected_size=-1) { + int64_t base = (complex_size - 1) * 2; + if (expected_size < 0) { + return base + 1; + } else if (base == expected_size) { + return base; + } else if (base + 1 == expected_size) { + return base + 1; + } else { + std::ostringstream ss; + ss << "expected real signal size " << expected_size << " is incompatible " + << "with onesided complex frequency size " << complex_size; + TORCH_CHECK(false, ss.str()); + } +} + +using fft_fill_with_conjugate_symmetry_fn = + void (*)(ScalarType dtype, IntArrayRef mirror_dims, IntArrayRef half_sizes, + IntArrayRef in_strides, const void* in_data, + IntArrayRef out_strides, void* out_data); +DECLARE_DISPATCH(fft_fill_with_conjugate_symmetry_fn, fft_fill_with_conjugate_symmetry_stub) + +// In real-to-complex transform, cuFFT and MKL only fill half of the values +// due to conjugate symmetry. This function fills in the other half of the full +// fft by using the Hermitian symmetry in the signal. +// self should be the shape of the full signal and dims.back() should be the +// one-sided dimension. +// See NOTE [ Fourier Transform Conjugate Symmetry ] +TORCH_API void _fft_fill_with_conjugate_symmetry_(const Tensor& self, IntArrayRef dims); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/StridedRandomAccessor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/StridedRandomAccessor.h new file mode 100644 index 0000000000000000000000000000000000000000..911a18e1b6ed6a5739b24e4eb60052527b43fc4c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/StridedRandomAccessor.h @@ -0,0 +1,306 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at::native { + +// (Const)StridedRandomAccessor is a +// (const) random access iterator defined over +// a strided array. + +// The traits below are to introduce __restrict__ +// modifier on different platforms. + +template +struct DefaultPtrTraits { + using PtrType = T*; +}; + +#if (defined(_WIN32) || defined(_WIN64)) +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + +template +struct RestrictPtrTraits { + using PtrType = T* RESTRICT; +}; + +template < + typename T, + typename index_t = int64_t, + template class PtrTraits = DefaultPtrTraits +> +class ConstStridedRandomAccessor { +public: + using difference_type = index_t; + using value_type = const T; + using pointer = const typename PtrTraits::PtrType; + using reference = const value_type&; + using iterator_category = std::random_access_iterator_tag; + + using PtrType = typename PtrTraits::PtrType; + using index_type = index_t; + + // Constructors { + C10_HOST_DEVICE + ConstStridedRandomAccessor(PtrType ptr, index_t stride) + : ptr{ptr}, stride{stride} + {} + + C10_HOST_DEVICE + explicit ConstStridedRandomAccessor(PtrType ptr) + : ptr{ptr}, stride{static_cast(1)} + {} + + C10_HOST_DEVICE + ConstStridedRandomAccessor() + : ptr{nullptr}, stride{static_cast(1)} + {} + // } + + // Pointer-like operations { + C10_HOST_DEVICE + reference operator*() const { + return *ptr; + } + + C10_HOST_DEVICE + const value_type* operator->() const { + return reinterpret_cast(ptr); + } + + C10_HOST_DEVICE + reference operator[](index_t idx) const { + return ptr[idx * stride]; + } + // } + + // Prefix/postfix increment/decrement { + C10_HOST_DEVICE + ConstStridedRandomAccessor& operator++() { + ptr += stride; + return *this; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor operator++(int) { + ConstStridedRandomAccessor copy(*this); + ++*this; + return copy; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor& operator--() { + ptr -= stride; + return *this; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor operator--(int) { + ConstStridedRandomAccessor copy(*this); + --*this; + return copy; + } + // } + + // Arithmetic operations { + C10_HOST_DEVICE + ConstStridedRandomAccessor& operator+=(index_t offset) { + ptr += offset * stride; + return *this; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor operator+(index_t offset) const { + return ConstStridedRandomAccessor(ptr + offset * stride, stride); + } + + C10_HOST_DEVICE + friend ConstStridedRandomAccessor operator+( + index_t offset, + const ConstStridedRandomAccessor& accessor + ) { + return accessor + offset; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor& operator-=(index_t offset) { + ptr -= offset * stride; + return *this; + } + + C10_HOST_DEVICE + ConstStridedRandomAccessor operator-(index_t offset) const { + return ConstStridedRandomAccessor(ptr - offset * stride, stride); + } + + // Note that this operator is well-defined when `this` and `other` + // represent the same sequences, i.e. when + // 1. this.stride == other.stride, + // 2. |other - this| / this.stride is an Integer. + C10_HOST_DEVICE + difference_type operator-(const ConstStridedRandomAccessor& other) const { + return (ptr - other.ptr) / stride; + } + // } + + // Comparison operators { + C10_HOST_DEVICE + bool operator==(const ConstStridedRandomAccessor& other) const { + return (ptr == other.ptr) && (stride == other.stride); + } + + C10_HOST_DEVICE + bool operator!=(const ConstStridedRandomAccessor& other) const { + return !(*this == other); + } + + C10_HOST_DEVICE + bool operator<(const ConstStridedRandomAccessor& other) const { + return ptr < other.ptr; + } + + C10_HOST_DEVICE + bool operator<=(const ConstStridedRandomAccessor& other) const { + return (*this < other) || (*this == other); + } + + C10_HOST_DEVICE + bool operator>(const ConstStridedRandomAccessor& other) const { + return !(*this <= other); + } + + C10_HOST_DEVICE + bool operator>=(const ConstStridedRandomAccessor& other) const { + return !(*this < other); + } + // } + +protected: + PtrType ptr; + index_t stride; +}; + +template < + typename T, + typename index_t = int64_t, + template class PtrTraits = DefaultPtrTraits +> +class StridedRandomAccessor + : public ConstStridedRandomAccessor { +public: + using difference_type = index_t; + using value_type = T; + using pointer = typename PtrTraits::PtrType; + using reference = value_type&; + + using BaseType = ConstStridedRandomAccessor; + using PtrType = typename PtrTraits::PtrType; + + // Constructors { + C10_HOST_DEVICE + StridedRandomAccessor(PtrType ptr, index_t stride) + : BaseType(ptr, stride) + {} + + C10_HOST_DEVICE + explicit StridedRandomAccessor(PtrType ptr) + : BaseType(ptr) + {} + + C10_HOST_DEVICE + StridedRandomAccessor() + : BaseType() + {} + // } + + // Pointer-like operations { + C10_HOST_DEVICE + reference operator*() const { + return *this->ptr; + } + + C10_HOST_DEVICE + value_type* operator->() const { + return reinterpret_cast(this->ptr); + } + + C10_HOST_DEVICE + reference operator[](index_t idx) const { + return this->ptr[idx * this->stride]; + } + // } + + // Prefix/postfix increment/decrement { + C10_HOST_DEVICE + StridedRandomAccessor& operator++() { + this->ptr += this->stride; + return *this; + } + + C10_HOST_DEVICE + StridedRandomAccessor operator++(int) { + StridedRandomAccessor copy(*this); + ++*this; + return copy; + } + + C10_HOST_DEVICE + StridedRandomAccessor& operator--() { + this->ptr -= this->stride; + return *this; + } + + C10_HOST_DEVICE + StridedRandomAccessor operator--(int) { + StridedRandomAccessor copy(*this); + --*this; + return copy; + } + // } + + // Arithmetic operations { + C10_HOST_DEVICE + StridedRandomAccessor& operator+=(index_t offset) { + this->ptr += offset * this->stride; + return *this; + } + + C10_HOST_DEVICE + StridedRandomAccessor operator+(index_t offset) const { + return StridedRandomAccessor(this->ptr + offset * this->stride, this->stride); + } + + C10_HOST_DEVICE + friend StridedRandomAccessor operator+( + index_t offset, + const StridedRandomAccessor& accessor + ) { + return accessor + offset; + } + + C10_HOST_DEVICE + StridedRandomAccessor& operator-=(index_t offset) { + this->ptr -= offset * this->stride; + return *this; + } + + C10_HOST_DEVICE + StridedRandomAccessor operator-(index_t offset) const { + return StridedRandomAccessor(this->ptr - offset * this->stride, this->stride); + } + + // Note that here we call BaseType::operator- version + C10_HOST_DEVICE + difference_type operator-(const BaseType& other) const { + return (static_cast(*this) - other); + } + // } +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h new file mode 100644 index 0000000000000000000000000000000000000000..a773709dfe66782f4c496dba20daf5a8cf1d1bb9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h @@ -0,0 +1,107 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// Indexing tensors by tensors + +#include +#include +#include +#include + +namespace at { +struct TensorIterator; +} + +namespace at::native { + +using index_put_with_sort_fn = void (*)( + Tensor&, + const c10::List>&, + const Tensor&, + bool accumulate, + bool unsafe); +using index_put_with_sort_quantized_fn = void (*)( + Tensor& self, + const c10::List>& indices, + const Tensor& value, + double scale, + int zero_point, + bool unsafe); +using gather_fn = void (*)( + const Tensor& result, + const Tensor& self, + int64_t dim, + const Tensor& index); +using scatter_fn = void (*)( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src); +using scatter_fill_fn = void (*)( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Scalar& src); +using scatter_add_fn = void (*)( + const Tensor& self, + int64_t dim, + const Tensor& index, + const Tensor& src); +using scatter_reduce_fn = void (*)( + const Tensor& self, + const int64_t dim, + const Tensor& index, + const Tensor& src, + const ReductionType& reduce); +using scatter_scalar_reduce_fn = void (*)( + const Tensor& self, + const int64_t dim, + const Tensor& index, + const Scalar& value, + const ReductionType& reduce); +using scatter_reduce_two_fn = void (*)( + const Tensor& self, + const int64_t dim, + const Tensor& index, + const Tensor& src, + const ReductionType& reduce); + +DECLARE_DISPATCH(index_put_with_sort_fn, index_put_with_sort_stub) +DECLARE_DISPATCH( + index_put_with_sort_quantized_fn, + index_put_with_sort_quantized_stub) +DECLARE_DISPATCH(gather_fn, gather_stub) +DECLARE_DISPATCH(scatter_fn, scatter_stub) +DECLARE_DISPATCH(scatter_fill_fn, scatter_fill_stub) +DECLARE_DISPATCH(scatter_add_fn, scatter_add_stub) +DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub) +DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub) +DECLARE_DISPATCH(scatter_reduce_two_fn, scatter_reduce_two_stub) + +TORCH_API Tensor& index_out( + Tensor& result, + const Tensor& self, + const c10::List>& indices); + +using scatter_add_expanded_index_fn = + void (*)(const Tensor&, const Tensor&, const Tensor&); +using scatter_reduce_expanded_index_fn = void (*)( + const Tensor&, + const Tensor&, + const Tensor&, + const ReductionType& reduce, + bool); +using gather_expanded_index_fn = + void (*)(const Tensor&, const Tensor&, const Tensor&); + +DECLARE_DISPATCH(scatter_add_expanded_index_fn, scatter_add_expanded_index_stub) +DECLARE_DISPATCH( + scatter_reduce_expanded_index_fn, + scatter_reduce_expanded_index_stub) +DECLARE_DISPATCH(gather_expanded_index_fn, gather_expanded_index_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..014ba51a02e0906fccd57e8ad4f597bb2067388f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h @@ -0,0 +1,110 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + +namespace at::native { +namespace { +#ifndef STRIP_ERROR_MESSAGES +inline std::string shapes_as_str(TensorList tensors) { + std::ostringstream os; + bool first = true; + for (auto& tensor : tensors) { + if (tensor.defined()) { + if (!first) { + os << ", "; + } + os << tensor.sizes(); + first = false; + } + } + return os.str(); +} +#endif +} // anonymous namespace + +inline std::tuple canDispatchToMaskedFill( + const Tensor& self, + const torch::List>& indices, + const Tensor& value) { + if (!(value.numel() == 1 && value.device().is_cpu())) { + return std::make_tuple(false, Tensor()); + } + int64_t num_ind = 0; + Tensor mask; + auto self_device = self.device(); + for (const std::optional& i : indices) { + if (!i.has_value() || !(*i).defined()) { + if (!mask.defined()) { + num_ind++; + } + } else { + const Tensor& index = *i; + if ((index.scalar_type() != kByte && index.scalar_type() != kBool) || + index.device() != self_device || mask.defined()) { + return std::make_tuple(false, Tensor()); + } else { + mask = index; + for (const auto j : c10::irange(index.dim())) { + int64_t srcIdx = num_ind + j; + TORCH_CHECK_INDEX( + index.size(j) == self.size(srcIdx), + "The shape of the mask ", + index.sizes(), + " at index ", + j, + " does not match the shape of the indexed tensor ", + self.sizes(), + " at index ", + srcIdx); + } + num_ind += mask.ndimension(); + } + } + } + for ([[maybe_unused]] const auto i : + c10::irange(num_ind, self.ndimension())) { + mask = mask.unsqueeze(-1); + } + return std::make_tuple(true, mask); +} + +inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) { + checkIndexTensorTypes(orig, /*allow_int*/ true); + // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more + // LongTensors + auto indices = expandTensors(self, orig, /*ensure_same_device=*/true); + // next broadcast all index tensors together + try { + indices = expand_outplace(indices); + } catch (std::exception&) { + TORCH_CHECK_INDEX( + false, + "shape mismatch: indexing tensors could not be broadcast together" + " with shapes ", + shapes_as_str(indices)); + } + // add missing null Tensors so that it matches self.dim() + while (indices.size() < (size_t)self.dim()) { + indices.emplace_back(); + } + // if the non-null indices are not all adjacent, transpose self and indices + // together so that they're adjacent at the front + if (!hasContiguousSubspace(indices)) { + std::tie(self, indices) = transposeToFront(self, indices); + } + for (auto& indice : indices) { + if (indice.defined() && indice.dtype() == at::kInt) { + indice = indice.to(at::kLong); + } + } + + return AdvancedIndex(self, indices); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorCompare.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorCompare.h new file mode 100644 index 0000000000000000000000000000000000000000..76a5cde2a6373a56d39052f8bfc21ab131b1ba6d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorCompare.h @@ -0,0 +1,61 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace c10 { +class Scalar; +} + +namespace at { +class Tensor; +struct TensorIterator; +struct TensorIteratorBase; +} // namespace at + +namespace at::native { + +using reduce_minmax_fn = + void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool); +using structured_reduce_minmax_fn = + void (*)(const Tensor&, const Tensor&, const Tensor&, int64_t, bool); + +DECLARE_DISPATCH(structured_reduce_minmax_fn, max_stub) +DECLARE_DISPATCH(structured_reduce_minmax_fn, min_stub) + +using where_fn = void (*)(TensorIterator&); +DECLARE_DISPATCH(where_fn, where_kernel) + +using is_infinity_op_fn = void (*)(TensorIteratorBase&); +DECLARE_DISPATCH(is_infinity_op_fn, isposinf_stub) +DECLARE_DISPATCH(is_infinity_op_fn, isneginf_stub) + +using mode_fn = void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool); +DECLARE_DISPATCH(mode_fn, mode_stub) + +using clamp_tensor_fn = void (*)(TensorIteratorBase&); +DECLARE_DISPATCH(clamp_tensor_fn, clamp_stub) + +namespace detail { +enum class ClampLimits { Min, Max, MinMax }; +} + +DECLARE_DISPATCH( + void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&), + clamp_scalar_stub) +DECLARE_DISPATCH( + void (*)(TensorIteratorBase&, c10::Scalar), + clamp_min_scalar_stub) +DECLARE_DISPATCH( + void (*)(TensorIteratorBase&, c10::Scalar), + clamp_max_scalar_stub) + +using isin_default_fn = + void (*)(const Tensor&, const Tensor&, bool, const Tensor&); +DECLARE_DISPATCH(isin_default_fn, isin_default_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorConversions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorConversions.h new file mode 100644 index 0000000000000000000000000000000000000000..cdd9cb0310602283ed1b0c2d2ef42c1094985f7d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorConversions.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace at { +class Tensor; +namespace native { +bool to_will_alias( + const Tensor& self, + std::optional dtype, + std::optional layout, + std::optional device, + bool copy, + std::optional optional_memory_format); + +Tensor to_meta(const Tensor& tensor); +std::optional to_meta(const std::optional& tensor); +std::vector to_meta(at::ITensorListRef t_list); +Tensor dense_to_sparse_with_mask( + const Tensor& self, + const Tensor& mask, + std::optional layout, + OptionalIntArrayRef blocksize, + std::optional dense_dim_opt); + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorDimApply.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorDimApply.h new file mode 100644 index 0000000000000000000000000000000000000000..8d3fcbe72b2fd8c0c4fb55574d4acd7dc843d752 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorDimApply.h @@ -0,0 +1,72 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { +// input tensors are non-zero dim and non-empty +template + +void tensor_dim_apply3( + const Tensor& self, + Tensor& values, + Tensor& indices, + int64_t dim, + Function func) { + int ndims = self.dim(); + int tensor_dim_apply_has_finished = 0; + std::vector counter(ndims, 0); + const T1* self_data = self.const_data_ptr(); + T1* values_data = values.data_ptr(); + T2* indices_data = indices.data_ptr(); + int64_t self_stride = self.stride(dim); + int64_t values_stride = values.stride(dim); + int64_t indices_stride = indices.stride(dim); + int self_dim_size = self.size(dim); + + while (!tensor_dim_apply_has_finished) { + func( + self_data, + values_data, + indices_data, + self_dim_size, + self_stride, + values_stride, + indices_stride); + if (ndims == 1) { + break; + } + for (const auto dim_i : c10::irange(ndims)) { + if (dim_i == dim) { + if (dim_i == (ndims - 1)) { + tensor_dim_apply_has_finished = 1; + break; + } + continue; + } + counter[dim_i]++; + self_data += self.stride(dim_i); + values_data += values.stride(dim_i); + indices_data += indices.stride(dim_i); + + if (counter[dim_i] == self.size(dim_i)) { + if (dim_i == ndims - 1) { + tensor_dim_apply_has_finished = 1; + break; + } else { + self_data -= counter[dim_i] * self.stride(dim_i); + values_data -= counter[dim_i] * values.stride(dim_i); + indices_data -= counter[dim_i] * indices.stride(dim_i); + counter[dim_i] = 0; + } + } else { + break; + } + } + } +} +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorFactories.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorFactories.h new file mode 100644 index 0000000000000000000000000000000000000000..9983d87903ee97bf9a3f5ad120fa134d82ba3d3c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorFactories.h @@ -0,0 +1,174 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { +// Different combinations of row, col, and offset can lead to two cases: +// +// Case 1 - Trapezoid (Triangle as a special case): row + offset <= col +// Example A: offset > 0 +// 1 1 0 0 0 +// 1 1 1 0 0 +// 1 1 1 1 0 +// Example B: offset <= 0 +// 0 0 0 +// 1 0 0 +// 1 1 0 +// In this case, we calculate the number of elements in the first row and +// last row of the tril respectively, and then compute the tril size. +// +// Case 2 - Trapezoid + Rectangle: row + offset > col +// Example: +// 1 1 0 +// 1 1 1 +// 1 1 1 +// In this case, we first calculate the size of top trapezoid, and then +// calculate the size of the bottom rectangle. +inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) { + // If either dimension is 0 then the there is no tril + if (row == 0 || col == 0) { + return 0; + } + // number of elements in the first row of the tril + auto m_first_row = offset > 0 ? std::min(col, 1 + offset) + : // upper bounded by col + row + offset > 0; // either 0 or 1 + // number of elements in the last row of the tril, bounded by [0, col] + auto m_last_row = std::max(0, std::min(col, row + offset)); + // number of rows, bounded by [0, row] + auto n_row_all = std::max(0, std::min(row, row + offset)); + auto n_row_trapezoid = (m_last_row - m_first_row + 1); + + // calculate # of elements in the top trapezoid + auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1; + + // calculate # of elements in the bottom rectangle if there is any + auto diff_row = n_row_all - n_row_trapezoid; + if (diff_row > 0) { + tril_size += diff_row * col; + } + + return tril_size; +} + +inline void check_args( + int64_t row, + int64_t col, + std::optional layout_opt) { + TORCH_CHECK(row >= 0, "row must be non-negative, got", row); + TORCH_CHECK(col >= 0, "col must be non-negative, got", col); + if (layout_opt.has_value()) { + TORCH_CHECK( + *layout_opt == at::kStrided, + "only support layout=torch.strided, got", + *layout_opt) + } +} + +using at::check_size_nonnegative; + +// assumes maximum value in created tensor is n-1 (e.g., torch.randperm(n)) +inline void check_supported_max_int_with_precision( + int64_t n, + const Tensor& tensor) { + // match defined() to behavior of checks below + TORCH_CHECK( + at::scalar_tensor(n > 0 ? n - 1 : n, tensor.options()).defined(), + "n is too large for result tensor type: '", + tensor.toString(), + "'"); + + // Ensure sufficient precision for floating point representation. + switch (tensor.scalar_type()) { + case at::ScalarType::Half: + TORCH_CHECK( + n <= (int64_t(1) << 11) + 1, + "n cannot be greater than 2049 for Half type."); + break; + case at::ScalarType::Float: + TORCH_CHECK( + n <= (int64_t(1) << 24) + 1, + "n cannot be greater than 2^24+1 for Float type."); + break; + case at::ScalarType::Double: // Unlikely to happen, but doesn't hurt to + // check + TORCH_CHECK( + n <= (int64_t(1) << 53) + 1, + "n cannot be greater than 2^53+1 for Double type."); + break; + default: + break; + } +} + +// Called by `empty*` functions when deterministic algorithms are enabled to +// fill the tensor with NaN if it is floating point or complex type, or fill +// with max value if it is integer type +inline Tensor& fill_empty_deterministic_(Tensor& tensor) { + if (tensor.is_floating_point() || tensor.is_complex()) { + AT_DISPATCH_V2( + tensor.scalar_type(), + "fill_empty_deterministic_", + AT_WRAP([&]() { + tensor.fill_(std::numeric_limits::quiet_NaN()); + }), + AT_EXPAND(AT_FLOATING_TYPES), + AT_EXPAND(AT_COMPLEX_TYPES), + AT_EXPAND(AT_FLOAT8_TYPES), + kBFloat16, + kHalf, + kComplexHalf); + } else { + AT_DISPATCH_V2( + tensor.scalar_type(), + "fill_empty_deterministic_", + AT_WRAP([&]() { tensor.fill_(std::numeric_limits::max()); }), + kBool, + AT_EXPAND(AT_INTEGRAL_TYPES_V2)); + } + return tensor; +} + +// The ZeroTensor allocator ignores whatever allocation is requested and always +// gives you nullptr +struct ZeroTensorAllocator final : public at::Allocator { + ZeroTensorAllocator(at::Device device) : device_(device) {} + ~ZeroTensorAllocator() override = default; + static void deleter(void* const pointer) { + TORCH_INTERNAL_ASSERT(!pointer); + } + DataPtr allocate(const size_t /*nbytes*/) override { + return {nullptr, nullptr, &deleter, device_}; + } + DeleterFnPtr raw_deleter() const override { + return deleter; + } + void copy_data( + void* dest [[maybe_unused]], + const void* src [[maybe_unused]], + std::size_t count [[maybe_unused]]) const final {} + at::Device device_; +}; + +using binary_fn = void (*)(TensorIterator&); + +DECLARE_DISPATCH(binary_fn, complex_stub) +DECLARE_DISPATCH(binary_fn, polar_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorIterator.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorIterator.h new file mode 100644 index 0000000000000000000000000000000000000000..149e4dd914a0ee816278dd14a1b730980a2b7fa0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorIterator.h @@ -0,0 +1,7 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h new file mode 100644 index 0000000000000000000000000000000000000000..d5ec9b0f2b99536dde9ea31648095ab9fb022122 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h @@ -0,0 +1,57 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +// This file includes utilities for dynamic_casting done by TensorIterator, see +// CUDALoops.cuh and Loops.h. + +// dynamic_casting handles when the types expected by the iterator do not match +// the types of the arguments to the function that is being called. On CUDA, the +// cast is currently pushed down into the kernel (for performance reasons). On +// CPU, there is currently an internal assert that a dynamic_cast is not needed. + +namespace at::native { + +// `needs_dynamic_casting` compares the types expected by iterator +// (i.e. dtypes of the operands) with the actual type of the arguments +// (and returns) of func_t +template ::arity> +struct needs_dynamic_casting { + static bool check(TensorIteratorBase& iter) { + using traits = function_traits; + using cpp_type = typename traits::template arg::type; + using cpp_map = c10::CppTypeToScalarType; + + if (iter.input_dtype(nargs - 1) != cpp_map::value) { + return true; + } + return needs_dynamic_casting::check(iter); + } +}; + +template +struct needs_dynamic_casting { + static bool check(TensorIteratorBase& iter) { + using traits = function_traits; + using cpp_type = typename traits::result_type; + + // we could assert output numbers are correct here, but checks + // (including arity) are currently pushed outside of this struct. + if constexpr (std::is_void_v) { + return false; + } else { + return iter.dtype(0) != c10::CppTypeToScalarType::value; + } + } +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorProperties.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorProperties.h new file mode 100644 index 0000000000000000000000000000000000000000..53cb90246892b6b0e45fef516c4c70516cf14e3c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorProperties.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// See NOTE: [Tensor vs. TensorBase] +namespace at { +class TensorBase; +} + +namespace at::native { + +TORCH_API bool cudnn_is_acceptable(const TensorBase& self); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorShape.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorShape.h new file mode 100644 index 0000000000000000000000000000000000000000..55295ea0a0a7e42962d40dca91161e7d48523676 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorShape.h @@ -0,0 +1,150 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + +namespace at::native { + +TORCH_API at::Tensor clone_preserve_strides(const at::Tensor& self); + +inline bool cat_should_skip_tensor(const Tensor& t) { + return t.sym_numel() == 0 && t.dim() == 1; +} + +// Check to see if the shape of tensors is compatible +// for being concatenated along a given dimension. +inline void check_cat_shape_except_dim( + const Tensor& first, + const Tensor& second, + int64_t dimension, + int64_t index) { + int64_t first_dims = first.dim(); + int64_t second_dims = second.dim(); + TORCH_CHECK( + first_dims == second_dims, + "Tensors must have same number of dimensions: got ", + first_dims, + " and ", + second_dims); + for (const auto dim : c10::irange(first_dims)) { + if (dim == dimension) { + continue; + } + int64_t first_dim_size = first.sizes()[dim]; + int64_t second_dim_size = second.sizes()[dim]; + TORCH_CHECK( + first_dim_size == second_dim_size, + "Sizes of tensors must match except in dimension ", + dimension, + ". Expected size ", + static_cast(first_dim_size), + " but got size ", + static_cast(second_dim_size), + " for tensor number ", + index, + " in the list."); + } +} + +inline void check_cat_no_zero_dim(const MaterializedITensorListRef& tensors) { + [[maybe_unused]] int64_t i = 0; + for (const Tensor& t : tensors) { + TORCH_CHECK( + t.dim() > 0, + "zero-dimensional tensor (at position ", + i, + ") cannot be concatenated"); + i++; + } +} + +inline int64_t get_num_splits( + const Tensor& self, + int64_t split_size, + int64_t dim) { + TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor"); + TORCH_CHECK( + split_size >= 0, + "split expects split_size be non-negative, but got split_size=", + split_size); + int64_t dim_size = self.size(dim); + TORCH_CHECK( + split_size > 0 || dim_size == 0, + "split_size can only be 0 if dimension size is 0, " + "but got dimension size of ", + dim_size); + // if split_size is 0 and dimension size is 0, there is 1 split. + int64_t num_splits = 1; + if (split_size != 0) { + // ensuring num_splits is at least 1 makes consistent the case where + // split_size > dim_size (returns a single split). We might want to error + // here, but keep it for BC. + num_splits = std::max((dim_size + split_size - 1) / split_size, 1); + } + return num_splits; +} + +inline bool have_same_ndims(TensorList tensors) { + auto ndim = tensors[0].dim(); + for (const auto tensor_idx : c10::irange(tensors.size())) { + if (tensors[tensor_idx].dim() != ndim) { + return false; + } + } + return true; +} + +inline void leading_dimension_matches(TensorList tensors, int64_t dim) { + auto tensor_zero_size = tensors[0].sizes(); + std::vector leading_dim_sizes( + tensor_zero_size.begin(), tensor_zero_size.begin() + dim); + for (const auto i : c10::irange(tensors.size())) { + at::Tensor tensor = tensors[i]; + for (const auto j : c10::irange(dim)) { + TORCH_CHECK( + tensor.size(j) == leading_dim_sizes[j], + "_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors"); + } + } +} + +inline int64_t preprocess_chunk_cat_inputs( + TensorList tensors, + int64_t dim, + int64_t num_chunks) { + TORCH_CHECK(num_chunks >= 1, "_chunk_cat expects positive num_chunks"); + TORCH_CHECK( + !tensors.empty(), "_chunk_cat expects a non-empty input tensor list"); + auto expected_dtype = tensors[0].dtype(); + auto expected_device = tensors[0].device(); + for (const auto i : c10::irange(tensors.size())) { + TORCH_CHECK(tensors[i].numel() > 0, "_chunk_cat expects non-empty tensor"); + TORCH_CHECK( + tensors[i].dtype() == expected_dtype, + "_chunk_cat expects all input tensors with the same dtype"); + TORCH_CHECK( + tensors[i].device() == expected_device, + "_chunk_cat expects all inputs tensors on the same device"); + } + if (have_same_ndims(tensors)) { + dim = maybe_wrap_dim(dim, tensors[0].dim()); + } else { + TORCH_CHECK( + dim >= 0, + "_chunk_cat expects non-negative dim when input tensors have different ndims") + for (const auto i : c10::irange(tensors.size())) { + TORCH_CHECK( + dim < tensors[i].ndimension(), + "_chunk_cat expects dim < ndim for all input tensors"); + } + } + leading_dimension_matches(tensors, dim); + return dim; +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorTransformations.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorTransformations.h new file mode 100644 index 0000000000000000000000000000000000000000..b9f22183a1275308da9dc2b68a74d3048fd88770 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TensorTransformations.h @@ -0,0 +1,40 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include + +namespace at::native { + +static inline Tensor roll_common( + const Tensor& self, + IntArrayRef shifts, + IntArrayRef dims) { + TORCH_CHECK(!shifts.empty(), "`shifts` required"); + if (dims.empty() && shifts.size() == 1) { + auto flattened = self.contiguous().view(self.numel()); + return roll(flattened, shifts[0], 0).view(self.sizes()); + } + TORCH_CHECK( + shifts.size() == dims.size(), + "shifts and dimensions must align. shifts: ", + shifts.size(), + ", dims:", + dims.size()); + AT_ASSERT(dims.size() > 1); + auto tail_shifts = shifts.slice(1); + auto tail_dims = dims.slice(1); + auto first_dim_rolled = roll(self, shifts[0], dims[0]); + return at::roll(first_dim_rolled, tail_shifts, tail_dims); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TopKImpl.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TopKImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..dfef5ffb36b2ee1ccb84f1ba2e4882da4ef7d83a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TopKImpl.h @@ -0,0 +1,103 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +#ifdef CPU_CAPABILITY +inline namespace CPU_CAPABILITY { +#else +inline namespace DEFAULT { +#endif + +// Core topk loop, shared between CPU and QuantizedCPU +template +void topk_impl_loop( + const int64_t mode_values_stride, + const int64_t mode_indices_stride, + const int64_t tmp_values_stride, + const int64_t k, + const int64_t dim_size, + const bool largest, + const bool sorted, + char** data, const int64_t* strides, const int64_t n) { + + // If k is zero, then output values and indices are empty tensors + // So iterating over other dims is pointless + if (k == 0) { + return; + } + using elem_t = std::pair; + std::vector queue(dim_size); + for (const auto i : c10::irange(n)) { + TensorAccessor mode_values( + reinterpret_cast(data[0] + i * strides[0]), + &k, &mode_values_stride); + TensorAccessor mode_indices( + reinterpret_cast(data[1] + i * strides[1]), + &k, &mode_indices_stride); + TensorAccessor tmp_values( + reinterpret_cast(data[2] + i * strides[2]), + &dim_size, &tmp_values_stride); + + auto n_2 = dim_size; + auto use_partial_sort = k * 64 <= n_2; + + for (const auto j : c10::irange(n_2)) { + queue[j].first = tmp_values[j]; + queue[j].second = j; + } + + // we want nan to be sorted as top for numpy compatibility + if (use_partial_sort) { + if (largest) { + std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + } else { + std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + } + } else { + if (largest) { + std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + if (sorted) { + std::sort(queue.begin(), queue.begin() + k - 1, + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + } + } else { + std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + if (sorted) { + std::sort(queue.begin(), queue.begin() + k -1, + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + } + } + } + + for (const auto j : c10::irange(k)) { + mode_values[j] = queue[j].first; + mode_indices[j] = queue[j].second; + } + } +} + +} // namespace CPU_CAPABILITY +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TransposeType.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TransposeType.h new file mode 100644 index 0000000000000000000000000000000000000000..df8db2787d469eb272ed3d1dfd44787e42ae3c5c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TransposeType.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-default") + +namespace at::native { + +// Used as an interface between the different BLAS-like libraries +enum class TransposeType { + NoTranspose, + Transpose, + ConjTranspose, +}; + +// Transforms TransposeType into the BLAS / LAPACK format +static inline char to_blas(TransposeType trans) { + switch (trans) { + case TransposeType::Transpose: return 'T'; + case TransposeType::NoTranspose: return 'N'; + case TransposeType::ConjTranspose: return 'C'; + } + TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); +} + +} // namespace at::native + +C10_DIAGNOSTIC_POP() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TriangularOpsUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TriangularOpsUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..86a58a2e01abe58216af6f3b92f877a31bb5c3c5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TriangularOpsUtils.h @@ -0,0 +1,62 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include + +namespace at::native { + +/* + * Given batches of matrices with arbitrary batch dim, + * computes the number of batches for Triu and Tril. This ignores stride 0 dimension + */ +static inline int64_t batchCountTrilTriu(const Tensor& batched_matrices) { + int64_t result = 1; + for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) { + if (batched_matrices.stride(i) != 0) { + result *= batched_matrices.size(i); + } + } + return result; +} + +/* Checks a necessary property for the triu and tril implementations, hence the name. + * Here batch contiguity is checked for tensors with greater than 4 dimensions. + * Contiguous tensors and tensors with less than 3 dimensions pass this check + */ +static inline std::tuple checkTrilTriuBatchContiguous(const Tensor& tensor, bool allow_zero_stride) { + // Complete contiguity is the most desired property, which is why + // we return true if the tensor is contiguous + if (tensor.is_contiguous()) { + auto default_strides_for_size = batched_matrix_contiguous_strides(tensor.sizes()); + if (tensor.strides() == default_strides_for_size) { + return std::make_tuple(true, tensor); + } else { + return std::make_tuple(false, tensor.as_strided(tensor.sizes(), default_strides_for_size)); + } + } + + int64_t dims = tensor.dim(); + + // Tensors with dimension less than 4 are handled by default + if (allow_zero_stride && dims <= 3) { + return std::make_tuple(true, tensor); + } + + int64_t expected_stride = tensor.size(-1) * tensor.size(-2); + for (int64_t i = dims - 3; i >= 0; i--) { + // Skip trivial dimension; + if (allow_zero_stride && i == 0 && (tensor.stride(i) == 0 || tensor.size(i) == 1)) { + continue; + } + if (expected_stride != tensor.stride(i)) { + return std::make_tuple(false, tensor.contiguous()); + } + expected_stride *= tensor.size(i); + } + return std::make_tuple(true, tensor); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TypeProperties.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TypeProperties.h new file mode 100644 index 0000000000000000000000000000000000000000..d276b7e8b8c9683ee776012fd6575781e31ef270 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/TypeProperties.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +struct ResultTypeState { + c10::ScalarType dimResult = ScalarType::Undefined; + c10::ScalarType wrappedResult = ScalarType::Undefined; + c10::ScalarType zeroResult = ScalarType::Undefined; +}; + +TORCH_API ResultTypeState update_result_type_state(const Tensor& tensor, const ResultTypeState& in_state); +TORCH_API ResultTypeState update_result_type_state(const Scalar& scalar, const ResultTypeState& in_state); +TORCH_API ScalarType result_type(const ResultTypeState& state); + +TORCH_API ScalarType result_type(ITensorListRef tensors); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/UnaryOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/UnaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..5e8a1ff0748d64bd47eb4f99709dd768885d308b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/UnaryOps.h @@ -0,0 +1,133 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at { +class Tensor; +class TensorBase; +struct TensorIteratorBase; +} + +namespace at::native { + +using unary_fn = void(*)(TensorIteratorBase&); +using unary_fn_with_scalar = void(*)(TensorIteratorBase&, const Scalar& a); + +inline namespace CPU_CAPABILITY { +void conj_kernel(TensorIteratorBase &iter); +void neg_kernel(TensorIteratorBase &iter); +void reciprocal_kernel(TensorIteratorBase &iter); +void rsqrt_kernel(TensorIteratorBase& iter); +void sqrt_kernel(TensorIteratorBase& iter); +} // namespace CPU_CAPABILITY + +DECLARE_DISPATCH(unary_fn, abs_stub) +DECLARE_DISPATCH(unary_fn, angle_stub) +DECLARE_DISPATCH(unary_fn, conj_physical_stub) +DECLARE_DISPATCH(unary_fn, acos_stub) +DECLARE_DISPATCH(unary_fn, acosh_stub) +DECLARE_DISPATCH(unary_fn, asinh_stub) +DECLARE_DISPATCH(unary_fn, atanh_stub) +DECLARE_DISPATCH(unary_fn, asin_stub) +DECLARE_DISPATCH(unary_fn, atan_stub) +DECLARE_DISPATCH(unary_fn, bitwise_not_stub) +DECLARE_DISPATCH(unary_fn, logical_not_stub) +DECLARE_DISPATCH(unary_fn, ceil_stub) +DECLARE_DISPATCH(unary_fn, cos_stub) +DECLARE_DISPATCH(unary_fn, cosh_stub) +DECLARE_DISPATCH(unary_fn, digamma_stub) +DECLARE_DISPATCH(unary_fn, special_entr_stub) +DECLARE_DISPATCH(unary_fn, special_erfcx_stub) +DECLARE_DISPATCH(unary_fn, erf_stub) +DECLARE_DISPATCH(unary_fn, erfc_stub) +DECLARE_DISPATCH(unary_fn, erfinv_stub) +DECLARE_DISPATCH(unary_fn, exp_stub) +DECLARE_DISPATCH(unary_fn, exp2_stub) +DECLARE_DISPATCH(unary_fn, expm1_stub) +DECLARE_DISPATCH(unary_fn, floor_stub) +DECLARE_DISPATCH(unary_fn, frac_stub) +DECLARE_DISPATCH(unary_fn, frexp_stub) +DECLARE_DISPATCH(unary_fn, i0_stub) +DECLARE_DISPATCH(unary_fn, special_i0e_stub) +DECLARE_DISPATCH(unary_fn, special_i1_stub) +DECLARE_DISPATCH(unary_fn, special_i1e_stub) +DECLARE_DISPATCH(unary_fn, log_stub) +DECLARE_DISPATCH(unary_fn, log10_stub) +DECLARE_DISPATCH(unary_fn, log1p_stub) +DECLARE_DISPATCH(unary_fn, log2_stub) +DECLARE_DISPATCH(unary_fn, special_ndtri_stub) +DECLARE_DISPATCH(unary_fn, special_log_ndtr_stub) +DECLARE_DISPATCH(unary_fn, neg_stub) + +DECLARE_DISPATCH(unary_fn, reciprocal_stub) +DECLARE_DISPATCH(unary_fn, round_stub) +DECLARE_DISPATCH(unary_fn, rsqrt_stub) +DECLARE_DISPATCH(unary_fn, sigmoid_stub) +DECLARE_DISPATCH(unary_fn_with_scalar, logit_stub) +DECLARE_DISPATCH(unary_fn, sign_stub) +DECLARE_DISPATCH(unary_fn, signbit_stub) +DECLARE_DISPATCH(unary_fn, sgn_stub) +DECLARE_DISPATCH(unary_fn, sin_stub) +DECLARE_DISPATCH(unary_fn, sinc_stub) +DECLARE_DISPATCH(unary_fn, sinh_stub) +DECLARE_DISPATCH(unary_fn, sqrt_stub) +DECLARE_DISPATCH(unary_fn, tan_stub) +DECLARE_DISPATCH(unary_fn, tanh_stub) +DECLARE_DISPATCH(unary_fn, trigamma_stub) +DECLARE_DISPATCH(unary_fn, trunc_stub) +DECLARE_DISPATCH(unary_fn, lgamma_stub) +DECLARE_DISPATCH(unary_fn, special_airy_ai_stub) +DECLARE_DISPATCH(unary_fn, special_bessel_j0_stub) +DECLARE_DISPATCH(unary_fn, special_bessel_j1_stub) +DECLARE_DISPATCH(unary_fn, special_bessel_y0_stub) +DECLARE_DISPATCH(unary_fn, special_bessel_y1_stub) +DECLARE_DISPATCH(unary_fn, special_modified_bessel_i0_stub) +DECLARE_DISPATCH(unary_fn, special_modified_bessel_i1_stub) +DECLARE_DISPATCH(unary_fn, special_modified_bessel_k0_stub) +DECLARE_DISPATCH(unary_fn, special_modified_bessel_k1_stub) +DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k0_stub) +DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k1_stub) +DECLARE_DISPATCH(unary_fn, special_spherical_bessel_j0_stub) + +// NB: these are actually defined in Distribution +DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, std::optional), bernoulli_tensor_stub) +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, std::optional), bernoulli_scalar_stub) +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), cauchy_stub) +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, std::optional), exponential_stub) +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, std::optional), geometric_stub) +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), log_normal_stub) +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional), uniform_stub) +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, std::optional), normal_stub) +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, std::optional), random_from_to_stub) +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, std::optional), random_full_64_bits_range_stub) +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, std::optional), random_stub) + +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t, const double), kaiser_window_stub) +DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t), polygamma_stub) +DECLARE_DISPATCH( + void (*)(Tensor&, const Tensor&, int64_t, std::optional), + multinomial_with_replacement_stub) +DECLARE_DISPATCH( + void (*)( + TensorIteratorBase&, + std::optional, + std::optional, + std::optional), + nan_to_num_stub) +DECLARE_DISPATCH(void (*)(TensorIteratorBase&, int64_t), round_decimals_stub) + +// Missing unary functions +// digamma +// lgamma +// erfinv +// clone +// contiguous +// zero +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Unfold2d.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Unfold2d.h new file mode 100644 index 0000000000000000000000000000000000000000..e686c33e0a2541eed8c308a4dc77bc160d1ccc2f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Unfold2d.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native { + +using unfold2d_copy_fn = void (*)( + ScalarType dtype, + void *finput, + const void *input, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + int64_t n_input_plane, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + bool is_channels_last +); + +using unfold2d_acc_fn = void (*)( + ScalarType dtype, + void *finput, + void *input, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + int64_t n_input_plane, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + bool is_channels_last +); + +DECLARE_DISPATCH(unfold2d_copy_fn, unfolded2d_copy_stub) +DECLARE_DISPATCH(unfold2d_acc_fn, unfolded2d_acc_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Unfold3d.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Unfold3d.h new file mode 100644 index 0000000000000000000000000000000000000000..b8a2fa55234c0cd8181f8f286c438578df3368d2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/Unfold3d.h @@ -0,0 +1,54 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +void Unfold3dCopyCPU( + ScalarType dtype, + const void *src, + int64_t C, + int64_t X_D, + int64_t X_H, + int64_t X_W, + int64_t Y_D, + int64_t Y_H, + int64_t Y_W, + int64_t kernel_d, + int64_t kernel_h, + int64_t kernel_w, + int64_t stride_d, + int64_t stride_h, + int64_t stride_w, + int64_t pad_d, + int64_t pad_h, + int64_t pad_w, + void* dst); + +void Unfold3dAccCPU( + ScalarType dtype, + const void *src, + int64_t C, + int64_t X_D, + int64_t X_H, + int64_t X_W, + int64_t Y_D, + int64_t Y_H, + int64_t Y_W, + int64_t kernel_d, + int64_t kernel_h, + int64_t kernel_w, + int64_t stride_d, + int64_t stride_h, + int64_t stride_w, + int64_t pad_d, + int64_t pad_h, + int64_t pad_w, + void *dst); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/UnfoldBackward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/UnfoldBackward.h new file mode 100644 index 0000000000000000000000000000000000000000..460ca209afb397ff15af4cde24cc6665f6d380f9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/UnfoldBackward.h @@ -0,0 +1,115 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +namespace at::native { + +using unfold_backward_fn = void (*)( + Tensor& grad_in, + const Tensor& grad, + int64_t dim, + int64_t size, + int64_t step +); + +DECLARE_DISPATCH(unfold_backward_fn, unfold_backward_stub) + +namespace { + +// Note on naming: it is unconventional. +// grad_in does not mean that it is a gradient wrt to input, +// grad_in/grad_out is just an input/output of unfold_backward kernel. + +[[maybe_unused]] TensorIterator _make_unfold_backward_iter_over_grad_out( + Tensor& grad_out, + const Tensor& grad_in, + int64_t dim, + int64_t size, + int64_t step) { + dim = maybe_wrap_dim(dim, grad_out.dim()); + // last dim stores the folds + + auto grad_out_dim_size = ensure_nonempty_size(grad_out, dim); + auto grad_in_dim_size = ensure_nonempty_size(grad_in, dim); + // dictates the number of elements to iterate over + // in dimension `dim` + auto iter_dim_size = std::min( + grad_out_dim_size, + (grad_in_dim_size - 1) * step + size + ); + + /* prepare grad_out for TensorIterator { */ + auto grad_out_strides = ensure_nonempty_vec(grad_out.strides().vec()); + auto grad_out_sizes = ensure_nonempty_vec(grad_out.sizes().vec()); + grad_out_sizes[dim] = iter_dim_size; + auto grad_out_restrided = grad_out.as_strided( + grad_out_sizes, grad_out_strides + ); + /* } */ + + /* prepare grad_in for TensorIterator { */ + auto grad_in_strides = ensure_nonempty_vec(grad_in.strides().vec()); + auto grad_in_sizes = ensure_nonempty_vec(grad_in.sizes().vec()); + + // set strides for dim to 0 + // and size to 1 because + // this dimension is indexed inside the kernel + grad_in_strides[dim] = 0; + grad_in_sizes[dim] = 1; + + grad_in_strides.pop_back(); + grad_in_sizes.pop_back(); + + auto grad_in_restrided = grad_in.squeeze(-1).as_strided( + grad_in_sizes, grad_in_strides + ); + /* } */ + + // During the TensorIterator iteration we have to know + // i_dim in grad_out[i_1,...,i_dim,...i_n], + // idx_dim stores this information + /* prepare idx_dim for TensorIterator { */ + auto idx_dim = at::arange( + 0, iter_dim_size, grad_in.options().dtype(at::kLong) + ); + + auto grad_out_dim = ensure_nonempty_dim(grad_out.dim()); + + auto idx_dim_strides = std::vector(grad_out_dim, 0); + auto idx_dim_sizes = std::vector(grad_out_dim, 1); + + idx_dim_strides[dim] = 1; + idx_dim_sizes[dim] = iter_dim_size; + + // idx_dim size will broadcast over determined by grad_out sizes in TensorIterator + auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides); + /* } */ + + auto iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_owned_output(grad_out_restrided) + .add_owned_const_input(grad_in_restrided) + .add_owned_const_input(idx_dim_restrided) + .build(); + + return iter; +} +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/UpSample.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/UpSample.h new file mode 100644 index 0000000000000000000000000000000000000000..e4e210901a0a2960a0191bc8d760201af9e764bf --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/UpSample.h @@ -0,0 +1,514 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +/** + * Note [compute_scales_value] + * Note [area_pixel_compute_scale] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Interpolate with scale_factor can have different behaviors + * depending on the value of recompute_scale_factor: + * + * - With recompute_scale_factor = True (current default behavior): + * the scale_factor, when provided by the user, are used to calculate + * the output size. The input size and the computed output_size + * are then used to infer new values for the scales which are + * used in the interpolation. Because floating-point math is not exact, + * this may be a different value from the user-supplied scales. + * + * - With recompute_scale_factor = False (which will be the default + * behavior starting 1.5.0): + * the behavior follows opencv logic, and the scales provided by + * the user are the ones used in the interpolation calculations. + * + * If the scales are not provided or if they are provided but + * recompute_scale_factor is set to True (default behavior), the scales + * are computed from the input and the output size; + * + * + * When the scales are inferred from the input and output sizes, + * we view each pixel as an area, idx + 0.5 as its center index. + * Here is an example formula in 1D case. + * if align_corners: center of two corner pixel areas are preserved, + * (0.5, 0.5) -> (0.5, 0.5), + * (input_size - 0.5, 0.5) -> (output_size - 0.5) + * scale = (input_size - 0.5 - 0.5) / (output_size - 0.5 - 0.5) + * src_index + 0.5 - 0.5 = scale * (dst_index + 0.5 - 0.5) + * if not align_corners: the whole range is scaled accordingly + * scale = input_size / output_size + * src_idx + 0.5 = scale * (dst_index + 0.5) + */ + +namespace at::native { + +namespace upsample { + +TORCH_API c10::SmallVector compute_output_size( + c10::IntArrayRef input_size, // Full input tensor size. + at::OptionalIntArrayRef output_size, + std::optional> scale_factors); + +inline std::optional get_scale_value(std::optional> scales, int idx) { + if (!scales) { + return std::nullopt; + } + return scales->at(idx); +} + +} // namespace upsample + +using scale_t = std::optional; +using upsampling_nearest1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); +using _upsampling_nearest_exact1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w); +using upsampling_nearest2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w); +using _upsampling_nearest_exact2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w); +using upsampling_nearest3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w); +using _upsampling_nearest_exact3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w); +using upsampling_linear1d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_w); +using upsampling_bilinear2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +using _upsampling_bilinear2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +using upsampling_trilinear3d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_d, scale_t scales_h, scale_t scales_w); +using upsampling_bicubic2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +using _upsampling_bicubic2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w); +DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_kernel) +DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_kernel) +DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_kernel) +DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_kernel) +DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_kernel) +DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_kernel) +DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_backward_kernel) +DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_backward_kernel) +DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_backward_kernel) +DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_backward_kernel) +DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_backward_kernel) +DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_backward_kernel) +DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_kernel) +DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_kernel) +DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_kernel) +DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_kernel) +DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_backward_kernel) +DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_backward_kernel) +DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_backward_kernel) +DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_backward_kernel) +DECLARE_DISPATCH(upsampling_bicubic2d, upsample_bicubic2d_kernel) +DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_kernel) +DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_backward_kernel) + +[[maybe_unused]] inline std::array upsample_1d_common_check( + IntArrayRef input_size, + IntArrayRef output_size) { + TORCH_CHECK( + output_size.size() == 1, + "It is expected output_size equals to 1, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 3, + "It is expected input_size equals to 3, but got size ", + input_size.size()); + + int64_t output_width = output_size[0]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_width = input_size[2]; + + TORCH_CHECK( + input_width > 0 && output_width > 0, + "Input and output sizes should be greater than 0, but got input (W: ", + input_width, + ") and output (W: ", + output_width, + ")"); + + return {nbatch, channels, output_width}; +} + +[[maybe_unused]] inline std::array upsample_2d_common_check( + IntArrayRef input_size, + IntArrayRef output_size) { + TORCH_CHECK( + output_size.size() == 2, + "It is expected output_size equals to 2, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 4, + "It is expected input_size equals to 4, but got size ", + input_size.size()); + + int64_t output_height = output_size[0]; + int64_t output_width = output_size[1]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_height = input_size[2]; + int64_t input_width = input_size[3]; + + TORCH_CHECK( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0, + "Input and output sizes should be greater than 0," + " but got input (H: ", + input_height, + ", W: ", + input_width, + ") output (H: ", + output_height, + ", W: ", + output_width, + ")"); + + return {nbatch, channels, output_height, output_width}; +} + +[[maybe_unused]] inline std::array upsample_3d_common_check( + IntArrayRef input_size, + IntArrayRef output_size) { + TORCH_CHECK( + output_size.size() == 3, + "It is expected output_size equals to 3, but got size ", + output_size.size()); + + TORCH_CHECK( + input_size.size() == 5, + "It is expected input_size equals to 5, but got size ", + input_size.size()); + + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + int64_t nbatch = input_size[0]; + int64_t channels = input_size[1]; + int64_t input_depth = input_size[2]; + int64_t input_height = input_size[3]; + int64_t input_width = input_size[4]; + + TORCH_CHECK( + input_depth > 0 && input_height > 0 && input_width > 0 && + output_depth > 0 && output_height > 0 && output_width > 0, + "Input and output sizes should be greater than 0, but got input (D: ", + input_depth, + ", H: ", + input_height, + ", W: ", + input_width, + ") output (D: ", + output_depth, + ", H: ", + output_height, + ", W: ", + output_width, + ")"); + + + return {nbatch, channels, output_depth, output_height, output_width}; +} + +inline void upsample_2d_shape_check( + const Tensor& input, + const Tensor& grad_output, + int64_t nbatch, + int64_t nchannels, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width) { + TORCH_CHECK( + input_height > 0 && input_width > 0 && output_height > 0 && + output_width > 0, + "Input and output sizes should be greater than 0," + " but got input (H: ", + input_height, + ", W: ", + input_width, + ") output (H: ", + output_height, + ", W: ", + output_width, + ")"); + + if (input.defined()) { + // Allow for empty batch size but not other dimensions + TORCH_CHECK( + (input.numel() != 0 || + (input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0) + ) && + input.dim() == 4, + "Non-empty 4D data tensor expected but got a tensor with sizes ", + input.sizes()); + } else if (grad_output.defined()) { + check_dim_size(grad_output, 4, 0, nbatch); + check_dim_size(grad_output, 4, 1, nchannels); + check_dim_size(grad_output, 4, 2, output_height); + check_dim_size(grad_output, 4, 3, output_width); + } +} + +template +inline scalar_t compute_scales_value( + const std::optional scale, + int64_t input_size, + int64_t output_size) { + // see Note [compute_scales_value] + // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults. + return (scale.has_value() && scale.value() > 0.) + ? static_cast(1.0 / scale.value()) + : (static_cast(input_size) / output_size); +} + +template +inline scalar_t area_pixel_compute_scale( + int64_t input_size, + int64_t output_size, + bool align_corners, + const std::optional scale) { + // see Note [area_pixel_compute_scale] + if(align_corners) { + if(output_size > 1) { + return static_cast(input_size - 1) / (output_size - 1); + } else { + return static_cast(0); + } + } else { + return compute_scales_value(scale, input_size, output_size); + } +} + +template +inline scalar_t area_pixel_compute_source_index( + scalar_t scale, + int64_t dst_index, + bool align_corners, + bool cubic) { + if (align_corners) { + return scale * dst_index; + } else { + scalar_t src_idx = scale * (dst_index + static_cast(0.5)) - + static_cast(0.5); + // [Note] Follow Opencv resize logic: + // We allow negative src_idx here and later will use + // dx = src_idx - floorf(src_idx) + // to compute the "distance"(which affects weights). + // For linear modes, weight distribution doesn't matter + // for negative indices as they use 2 pixels to interpolate. + // For example, [-1, 0], they both use pixel 0 value so it + // doesn't affect if we bound the src_idx to 0 or not. + // TODO: Our current linear mode impls use unbound indices + // where we should and then remove this cubic flag. + // This matters in cubic mode, as we might need [-1, 0, 1, 2] + // to interpolate and the weights can be affected. + return (!cubic && src_idx < static_cast(0)) ? scalar_t(0) + : src_idx; + } +} + +inline int64_t nearest_neighbor_compute_source_index( + const float scale, + int64_t dst_index, + int64_t input_size) { + // Index computation matching OpenCV INTER_NEAREST + // which is buggy and kept for BC + const int64_t src_index = + std::min(static_cast(floorf(dst_index * scale)), input_size - 1); + return src_index; +} + +inline int64_t nearest_neighbor_exact_compute_source_index( + const float scale, + int64_t dst_index, + int64_t input_size) { + // index_f32 = (output_index + 0.5) * scale - 0.5 + // input_index = round(index_f32) + // Same as Pillow and Scikit-Image/Scipy ndi.zoom + const int64_t src_index = + std::min(static_cast(floorf((dst_index + 0.5) * scale)), input_size - 1); + return src_index; +} + +inline int64_t nearest_idx( + int64_t output_index, + int64_t input_size, + int64_t output_size, + std::optional scales) { + // This method specifically treats cases: output_size == input_size or + // output_size == 2 * input_size, that we would like to get rid of + // We keep this method for BC and consider as deprecated. + // See nearest_exact_idx as replacement + if (output_size == input_size) { + // scale_factor = 1, simply copy + return output_index; + } else if (output_size == 2 * input_size) { + // scale_factor = 2, shift input index + return output_index >> 1; + } else { + float scale = compute_scales_value(scales, input_size, output_size); + return nearest_neighbor_compute_source_index(scale, output_index, input_size); + } +} + +inline int64_t nearest_exact_idx( + int64_t output_index, + int64_t input_size, + int64_t output_size, + std::optional scales) { + float scale = compute_scales_value(scales, input_size, output_size); + return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size); +} + +// Define a typedef to dispatch to nearest_idx or nearest_exact_idx +typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, std::optional); + +template +scalar_t upsample_get_value_bounded( + scalar_t* data, + int64_t width, + int64_t height, + int64_t x, + int64_t y) { + int64_t access_x = std::max(std::min(x, width - 1), static_cast(0)); + int64_t access_y = std::max(std::min(y, height - 1), static_cast(0)); + return data[access_y * width + access_x]; +} + +template +void upsample_increment_value_bounded( + scalar_t* data, + int64_t width, + int64_t height, + int64_t x, + int64_t y, + scalar_t value) { + int64_t access_x = std::max(std::min(x, width - 1), static_cast(0)); + int64_t access_y = std::max(std::min(y, height - 1), static_cast(0)); + data[access_y * width + access_x] += value; +} + +// Based on +// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm +template +scalar_t cubic_convolution1(scalar_t x, scalar_t A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +scalar_t cubic_convolution2(scalar_t x, scalar_t A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +static inline void get_cubic_upsample_coefficients( + scalar_t coeffs[4], + scalar_t t) { + scalar_t A = -0.75; + + scalar_t x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + scalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); +} + +template +inline scalar_t cubic_interp1d( + scalar_t x0, + scalar_t x1, + scalar_t x2, + scalar_t x3, + scalar_t t) { + scalar_t coeffs[4]; + get_cubic_upsample_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +// when `real_input_index` becomes larger than the range the floating point +// type can accurately represent, the type casting to `int64_t` might exceed +// `input_size`, causing overflow. So we guard it with `std::min` below. +template +inline void guard_index_and_lambda(const opmath_t& real_input_index, const int64_t& input_size, int64_t& input_index, scalar_t& lambda) { + input_index = std::min(static_cast(floorf(real_input_index)), input_size - 1); + lambda = std::min( + std::max(real_input_index - input_index, static_cast(0)), + static_cast(1) + ); +} + +template +inline void compute_source_index_and_lambda( + int64_t& input_index0, + int64_t& input_index1, + scalar_t& lambda0, + scalar_t& lambda1, + opmath_t ratio, + int64_t output_index, + int64_t input_size, + int64_t output_size, + bool align_corners) { + if (output_size == input_size) { + // scale_factor = 1, simply copy + input_index0 = output_index; + input_index1 = output_index; + lambda0 = static_cast(1); + lambda1 = static_cast(0); + } else { + const auto real_input_index = + area_pixel_compute_source_index( + ratio, output_index, align_corners, /*cubic=*/false); + guard_index_and_lambda(real_input_index, input_size, input_index0, lambda1); + int64_t offset = (input_index0 < input_size - 1) ? 1 : 0; + input_index1 = input_index0 + offset; + lambda0 = static_cast(1.) - lambda1; + } +} + +// It will not be used by data types other than BFloat16 and Half. +template || !std::is_same_v, int> = 0> +void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { + TORCH_CHECK((is_reduced_floating_point_v), + "Upsample backward only support BFloat16 and Half in the lower precision data types on CPU.") + TORCH_CHECK((std::is_same_v), + "Upsample backward should use float as acc buffer for BFloat16 and Half grad input on CPU.") + return; +} + +template && std::is_same_v, int> = 0> +void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { + using bVec = Vectorized; + using fVec = Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec gin_bvec = bVec::loadu(gin + d); + auto [gin_fvec0, gin_fvec1] = convert_to_float(gin_bvec); + gin_fvec0 += fVec::loadu(buffer_ptr + d); + gin_fvec1 += fVec::loadu(buffer_ptr + d + fVec::size()); + fVec(0).store(buffer_ptr + d); + fVec(0).store(buffer_ptr + d + fVec::size()); + convert_from_float(gin_fvec0, gin_fvec1).store(gin + d); + } + for (; d < size; d++) { + gin[d] += buffer_ptr[d]; + buffer_ptr[d] = 0; + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/batch_norm.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/batch_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..5a79be420f02438215df64272a8c5aa46c222e80 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/batch_norm.h @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +using batch_norm_fn = void (*)(Tensor&, const Tensor&, const Tensor&, + const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, bool, double); +using batch_norm_collect_stats_fn = void (*)(Tensor&, Tensor&, const Tensor&); +using batch_norm_backward_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&, + const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, bool, double); + +DECLARE_DISPATCH(batch_norm_fn, batch_norm_cpu_stub) +DECLARE_DISPATCH(batch_norm_collect_stats_fn, batch_norm_cpu_collect_stats_stub) +DECLARE_DISPATCH(batch_norm_backward_fn, batch_norm_cpu_backward_stub) + +// TensorAccessor when it is defined to work around undefined... +template +static TensorAccessor conditional_accessor_1d(const Tensor& t) { + if (! t.defined()) { + return TensorAccessor(nullptr, nullptr, nullptr); + } + return t.accessor(); +} + +template +static scalar_t* conditional_data_ptr(const Tensor& t) { + if constexpr (std::is_const_v) { + return t.defined() ? t.contiguous().const_data_ptr() + : nullptr; + } else { + return t.defined() ? t.contiguous().data_ptr() + : nullptr; + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h new file mode 100644 index 0000000000000000000000000000000000000000..1c10579e97b97a90e215e764777c120763df1c09 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h @@ -0,0 +1,42 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef ATOMIC_ADD_FLOAT +#define ATOMIC_ADD_FLOAT + +#if (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)) +#include +#else +#define _mm_pause() +#endif + +#include + +static inline void cpu_atomic_add_float(float* dst, float fvalue) +{ + typedef union { + unsigned intV; + float floatV; + } uf32_t; + + uf32_t new_value, old_value; + std::atomic* dst_intV = (std::atomic*)dst; + + old_value.floatV = *dst; + new_value.floatV = old_value.floatV + fvalue; + + unsigned* old_intV = &old_value.intV; + while (!std::atomic_compare_exchange_strong(dst_intV, old_intV, new_value.intV)) { +#ifdef __aarch64__ + __asm__ __volatile__("yield;" : : : "memory"); +#else + _mm_pause(); +#endif + old_value.floatV = *dst; + new_value.floatV = old_value.floatV + fvalue; + } +} + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CatKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CatKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c1d088a57f4a74af873ff240661852c76af3e144 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CatKernel.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native { + +using cat_serial_fn = void(*)(const Tensor &, const MaterializedITensorListRef&, int64_t); +DECLARE_DISPATCH(cat_serial_fn, cat_serial_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..242fdcbdd5eeaa2e893cad9dcc35a9cfe57d2183 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +using channel_shuffle_fn = void(*)(TensorBase&, const TensorBase&, int64_t); +DECLARE_DISPATCH(channel_shuffle_fn, channel_shuffle_kernel) + +} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CopyKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CopyKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..314bc2c06d7acdd436d11bc9d20eb4553efee103 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/CopyKernel.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at { +struct TensorIteratorBase; + +namespace native { +inline namespace CPU_CAPABILITY { + +void direct_copy_kernel(TensorIteratorBase &iter); +void copy_kernel(TensorIterator& iter, bool /*non_blocking*/); + +}}} // namespace at::native::CPU_CAPABILITY + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..58cc1809d43f8a937d2485fd40070e723f39db48 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +/* + Depthwise 3x3 Winograd convolution operator +*/ + +namespace at { +class Tensor; + +namespace native { + +using convolution_depthwise3x3_winograd_fn = + Tensor (*)(const Tensor &, const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, int64_t); + +DECLARE_DISPATCH(convolution_depthwise3x3_winograd_fn, convolution_depthwise3x3_winograd_stub) + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h new file mode 100644 index 0000000000000000000000000000000000000000..86cf48ff2a6823ccb27987121fd797d75e2b70e5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DistributionTemplates.h @@ -0,0 +1,430 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CPU_CAPABILITY_AVX2 +#include +#include +#endif + + + + +namespace at::native::templates::cpu { +namespace { + +// ==================================================== Random ======================================================== + +template +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG generator) { + AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cpu", AT_WRAP([&] { + std::lock_guard lock(generator->mutex_); + cpu_serial_kernel(iter, [range, base, generator]() -> scalar_t { + uniform_int_from_to_distribution random(range, base); + return random(generator); + }); + }), kBool, kHalf, kBFloat16, AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +} + +// This is the special kernel to handle single specific case: +// from(inclusive) = std::numeric_limits::lowest() +// to(exclusive) = None (= std::numeric_limits::max() + 1) +template +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG generator) { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cpu", [&] { + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + std::lock_guard lock(generator->mutex_); + cpu_serial_kernel(iter, [generator]() -> scalar_t { + uniform_int_full_range_distribution random; + return random(generator); + }); + } else { + TORCH_CHECK(false, "random_full_64_bits_range_kernel_cpu handles only int64, double, float and bfloat16"); + } + }); +} + +template +struct RandomFromToKernel { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { + random_from_to_kernel(iter, range, base, check_generator(gen)); + } + void operator()(TensorIteratorBase& iter, std::optional gen) { + random_full_64_bits_range_kernel(iter, check_generator(gen)); + } +}; + +template +void random_kernel(TensorIteratorBase& iter, RNG generator) { + std::lock_guard lock(generator->mutex_); + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cpu", [&] { + cpu_serial_kernel(iter, [generator]() -> scalar_t { + uniform_int_distribution random; + return random(generator); + }); + }); +} + +template +struct RandomKernel { + void operator()(TensorIteratorBase& iter, std::optional gen) { + random_kernel(iter, check_generator(gen)); + } +}; + +// ==================================================== Normal ======================================================== + +#ifdef CPU_CAPABILITY_AVX2 +void normal_fill_16_AVX2(float *data, + const __m256* two_pi, + const __m256* one, + const __m256* minus_two, + const __m256* mean, + const __m256* std_v) { + const __m256 u1 = _mm256_sub_ps(*one, _mm256_loadu_ps(data)); + const __m256 u2 = _mm256_loadu_ps(data + 8); + // sincos256_ps and log256_ps are from avx_mathfun.h + const __m256 radius = _mm256_sqrt_ps(_mm256_mul_ps(*minus_two, log256_ps(u1))); + const __m256 theta = _mm256_mul_ps(*two_pi, u2); + __m256 sintheta, costheta; + sincos256_ps(theta, &sintheta, &costheta); + const __m256 n1 = _mm256_mul_ps(radius, costheta); + const __m256 n2 = _mm256_mul_ps(radius, sintheta); + _mm256_storeu_ps(data, _mm256_fmadd_ps(n1, *std_v, *mean)); + _mm256_storeu_ps(data + 8, _mm256_fmadd_ps(n2, *std_v, *mean)); +} + +template +void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, RNG generator) { + float *data = self.data_ptr(); + auto size = self.numel(); + std::lock_guard lock(generator->mutex_); + for (const auto i : c10::irange(size)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + const __m256 two_pi = _mm256_set1_ps(2.0f * c10::pi); + const __m256 one = _mm256_set1_ps(1.0f); + const __m256 minus_two = _mm256_set1_ps(-2.0f); + const __m256 mean_v = _mm256_set1_ps(mean); + const __m256 std_v = _mm256_set1_ps(std); + + for (int64_t i = 0; i < size - 15; i += 16) { + normal_fill_16_AVX2(data + i, &two_pi, &one, &minus_two, &mean_v, &std_v); + } + + if (size % 16 != 0) { + // Recompute the last 16 values. + data = data + size - 16; + for (const auto i : c10::irange(16)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + normal_fill_16_AVX2(data, &two_pi, &one, &minus_two, &mean_v, &std_v); + } +} +#endif + +template +void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { + for (const auto j : c10::irange(8)) { + const scalar_t u1 = 1 - data[j]; // [0, 1) -> (0, 1] for log. + const scalar_t u2 = data[j + 8]; + const scalar_t radius = std::sqrt(-2 * std::log(u1)); + const scalar_t theta = 2.0f * c10::pi * u2; + data[j] = radius * std::cos(theta) * std + mean; + data[j + 8] = radius * std::sin(theta) * std + mean; + } +} + +#if defined(__VSX__) || defined(CPU_CAPABILITY_VSX) +static void normal_fill_16_VSX(float *data,const Vectorized &two_pi,const Vectorized &one,const Vectorized &minus_two,const Vectorized &mean,const Vectorized &std) { + using Vec = Vectorized; + Vec u1=one-Vec::loadu(data); + Vec u2=Vec::loadu(data+8); + Vec radius=(minus_two * u1.log()); + radius=radius.sqrt(); + Vec theta=two_pi * u2; + Vec output_vec=radius * theta.cos() * std + mean; + Vec output_vec2=radius * theta.sin() * std + mean; + output_vec.store(data); + output_vec2.store(data+8); +} + +template +void normal_fill_VSX(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { + float *data = self.data_ptr(); + auto size = self.numel(); + std::lock_guard lock(generator->mutex_); + for (const auto i : c10::irange(size)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + + using Vec = Vectorized; + const Vec two_pi = Vec(2.0f * c10::pi); + const Vec one = Vec(1.0f); + const Vec minus_two = Vec(-2.0f); + const Vec var_vec = Vec(std); + const Vec mean_vec = Vec(mean); + + for (int64_t i = 0; i < size - 15; i += 16) { + if(Vec::size()==8) { + normal_fill_16_VSX(data + i, two_pi, one, minus_two, mean_vec, var_vec); + } + else{ + normal_fill_16(data + i, mean, std); + } + } + if (size % 16 != 0) { + // Recompute the last 16 values. + data = data + size - 16; + for (const auto i : c10::irange(16)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + if(Vec::size()==8){ + normal_fill_16_VSX(data, two_pi, one, minus_two, mean_vec, var_vec); + } + else{ + normal_fill_16(data, mean, std); + } + } +} +#endif //VSX + +template +void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { + scalar_t *data = self.data_ptr(); + auto size = self.numel(); + std::lock_guard lock(generator->mutex_); + for (const auto i : c10::irange(size)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + + for (int64_t i = 0; i < size - 15; i += 16) { + normal_fill_16(data + i, mean, std); + } + if (size % 16 != 0) { + // Recompute the last 16 values. + data = data + size - 16; + for (const auto i : c10::irange(16)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + normal_fill_16(data, mean, std); + } +} + +template +void normal_kernel(const TensorBase &self, double mean, double std, RNG generator) { + auto size = self.numel(); + if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) { +#ifdef CPU_CAPABILITY_AVX2 + normal_fill_AVX2(self, static_cast(mean), static_cast(std), generator); +#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX) + normal_fill_VSX(self, static_cast(mean), static_cast(std), generator); +#else + normal_fill(self, static_cast(mean), static_cast(std), generator); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, self.scalar_type(), "normal_kernel_cpu", [&] { + if (size >= 16 && self.is_contiguous()) { + normal_fill(self, static_cast(mean), static_cast(std), generator); + } else { + auto iter = TensorIterator::borrowing_nullary_op(self); + std::lock_guard lock(generator->mutex_); + cpu_serial_kernel(iter, [mean, std, generator]() -> scalar_t { + at::normal_distribution normal(mean, std); + return static_cast(normal(generator)); + }); + } + }); + } +} + +template +struct NormalKernel { + void operator()(Tensor& self, double mean, double std, std::optional gen) { + normal_kernel(self, mean, std, check_generator(gen)); + } +}; + +// ==================================================== Uniform ======================================================= + +template +void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG generator) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "uniform_kernel_cpu", [&]() { + std::lock_guard lock(generator->mutex_); + auto from = static_cast(from_); + auto to = static_cast(to_); + at::uniform_real_distribution uniform(from, to); + cpu_serial_kernel(iter, [&uniform, generator]() -> scalar_t { + return static_cast(uniform(generator)); + }); + }); +} + +template +struct UniformKernel { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { + uniform_kernel(iter, from, to, check_generator(gen)); + } +}; + +// ==================================================== Cauchy ======================================================== + +template +void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, RNG generator) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "cauchy_cpu", [&]() { + std::lock_guard lock(generator->mutex_); + at::cauchy_distribution cauchy(median, sigma); + cpu_serial_kernel(iter, [&cauchy, generator]() -> scalar_t { + return static_cast(cauchy(generator)); + }); + }); +} + +template +struct CauchyKernel { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { + cauchy_kernel(iter, median, sigma, check_generator(gen)); + } +}; + +// ================================================== LogNormal ======================================================= + +template +void log_normal_kernel(TensorIteratorBase& iter, double mean, double std, RNG generator) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cpu", [&]() { + std::lock_guard lock(generator->mutex_); + at::lognormal_distribution logNormal(mean, std); + cpu_serial_kernel(iter, [&logNormal, generator]() -> scalar_t { + return static_cast(logNormal(generator)); + }); + }); +} + +template +struct LogNormalKernel { + void operator()(TensorIteratorBase& iter, double mean, double std, std::optional gen) { + log_normal_kernel(iter, mean, std, check_generator(gen)); + } +}; + +// =================================================== Geometric ====================================================== + +template +void geometric_kernel(TensorIteratorBase& iter, double p, RNG generator) { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cpu", [&]() { + std::lock_guard lock(generator->mutex_); + at::geometric_distribution geometric(p); + cpu_serial_kernel(iter, [&geometric, generator]() -> scalar_t { + return static_cast(geometric(generator)); + }); + }); +} + +template +struct GeometricKernel { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { + geometric_kernel(iter, p, check_generator(gen)); + } +}; + +// ================================================== Exponential ===================================================== + +template +void exponential_kernel(TensorIteratorBase& iter, double lambda, RNG generator) { + TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype()); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cpu", [&]() { + std::lock_guard lock(generator->mutex_); + at::exponential_distribution exponential(lambda); + cpu_serial_kernel(iter, [&exponential, generator]() -> scalar_t { + return static_cast(exponential(generator)); + }); + }); +} + +template +struct ExponentialKernel { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { + exponential_kernel(iter, lambda, check_generator(gen)); + } +}; + +// ================================================== Bernoulli ======================================================= + +template +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generator) { + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, + self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(generator->mutex_); + using self_t = scalar_t; + auto p_cpu = p_.to(kCPU); + auto p = expand_inplace(self, p_cpu); + auto iter = TensorIteratorConfig() + .add_output(self) + .add_const_input(*p) + .check_all_same_dtype(false) + .build(); + if (p->scalar_type() == kDouble) { + cpu_serial_kernel(iter, [&](const double p_val) -> self_t { + at::bernoulli_distribution bernoulli(p_val); + return static_cast(bernoulli(generator)); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, + p->scalar_type(), "bernoulli_tensor_cpu_p_", [&] { + using p_t = scalar_t; + cpu_serial_kernel(iter, [&](const p_t p_val) -> self_t { + at::bernoulli_distribution bernoulli(p_val); + return static_cast(bernoulli(generator)); + }); + }); + } + }); +} + +template +void bernoulli_kernel(const TensorBase &self, double p, RNG generator) { + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, + self.scalar_type(), "bernoulli_scalar_cpu_", [&] { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(generator->mutex_); + auto iter = TensorIterator::borrowing_nullary_op(self); + cpu_serial_kernel(iter, [p, generator]() -> scalar_t { + at::bernoulli_distribution bernoulli(p); + return static_cast(bernoulli(generator)); + }); + }); +} + +template +struct BernoulliKernel { + void operator()(const TensorBase &self, double p, std::optional gen) { + bernoulli_kernel(self, p, check_generator(gen)); + } + void operator()(const TensorBase &self, const TensorBase &p_, std::optional gen) { + bernoulli_kernel(self, p_, check_generator(gen)); + } +}; + +}} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Elu.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Elu.h new file mode 100644 index 0000000000000000000000000000000000000000..a371690139f4b05463eb97488e3b7e655a977601 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Elu.h @@ -0,0 +1,79 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// On Windows, math.h needs to be included with _USE_MATH_DEFINES defined to +// access constants such as M_SQRT2 and M_2_SQRTPI. +#ifdef _WIN32 +#define _USE_MATH_DEFINES +#include +#endif // _WIN32 + +#include +#include // For c10::is_reduced_floating_point_v. + +namespace at::native { +inline namespace CPU_CAPABILITY { +/** + * Return a function object that calculates ELU with the given + * parameters on its input element. ParamT is the type of the input + * and output to the ELU, and MathT is the type (possibly + * higher-precision, e.g. float if ParamT is reduced-precision float) + * in which to do intermediate calculations. + */ +template +auto get_scalar_elu_elementwise_func(MathT alpha, MathT scale, MathT input_scale) { + const auto negcoef = alpha * scale; + const auto poscoef = scale; + const auto negiptcoef = input_scale; + return [negcoef, negiptcoef, poscoef](ParamT a) -> ParamT { + return MathT(a) < MathT(0) + ? std::expm1(MathT(a) * negiptcoef) * negcoef + : MathT(a) * poscoef; + }; +} + +/** + * Return a function object that calculates ELU with the given + * parameters on its input element. The function object takes and + * returns Vectorized. + */ +template , bool> = true> +auto get_vectorized_elu_elementwise_func(T alpha, T scale, T input_scale) { + const vec::Vectorized negcoef_vec(alpha * scale); + const vec::Vectorized poscoef_vec(scale); + const vec::Vectorized negiptcoef_vec(input_scale); + const vec::Vectorized zero_vec(static_cast(0)); + return [negcoef_vec, poscoef_vec, negiptcoef_vec, zero_vec](vec::Vectorized a) -> vec::Vectorized { + const auto cmp = a >= zero_vec; + if (!cmp.zero_mask()) { + return a * poscoef_vec; + } else { + return vec::Vectorized::blendv((a * negiptcoef_vec).expm1() * negcoef_vec, a * poscoef_vec, cmp); + } + }; +} + +/** + * Return a function object that calculates ELU with the given + * parameters on its input element. The function object takes and + * returns Vectorized, and Vectorized is the type + * (possibly higher-precision) in which to do intermediate + * calculations. + */ +template , bool> = true> +auto get_vectorized_elu_elementwise_func(float alpha, float scale, float input_scale) { + // Takes float->float. + const auto float_func = get_vectorized_elu_elementwise_func(alpha, scale, input_scale); + return [float_func](vec::Vectorized a) -> vec::Vectorized { + auto [a0, a1] = vec::convert_to_float(a); + auto res0 = float_func(a0); + auto res1 = float_func(a1); + return vec::convert_from_float(res0, res1); + }; +} +} // namespace CPU_CAPABILITY +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Gelu.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Gelu.h new file mode 100644 index 0000000000000000000000000000000000000000..e214126e00d106b18702cb926c9a0c1fe5550c27 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Gelu.h @@ -0,0 +1,88 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// On Windows, math.h needs to be included with _USE_MATH_DEFINES defined to +// access constants such as M_SQRT2 and M_2_SQRTPI. +#ifdef _WIN32 +#define _USE_MATH_DEFINES +#include +#include +#endif // _WIN32 + +#include +#include // For c10::is_reduced_floating_point_v. + +namespace at::native { +inline namespace CPU_CAPABILITY { +constexpr double kGeluBeta = M_SQRT2 * M_2_SQRTPI * 0.5; +constexpr double kGeluKappa = 0.044715; + +template +using reduced_fp_to_float_t = std::conditional_t, float, T>; + +template , bool> = true> +float reduced_fp_to_float(T x) { + return float(x); +} + +template , bool> = true> +T reduced_fp_to_float(T x) { + return x; +} + +template +T scalar_gelu_approximated_with_tanh(T x) { + using opmath_t = reduced_fp_to_float_t; + auto x_float = reduced_fp_to_float(x); + auto x_cube = x_float * x_float * x_float; + auto inner = opmath_t(kGeluBeta) * (x_float + opmath_t(kGeluKappa) * x_cube); + return opmath_t(0.5) * x_float * (opmath_t(1) + std::tanh(inner)); +} + +template , bool> = true> +vec::Vectorized vectorized_gelu_approximated_with_tanh(vec::Vectorized x) { + const vec::Vectorized kPointFiveVec(T(0.5)); + const vec::Vectorized kOneVec(T(1)); + const vec::Vectorized kGeluBetaVec((T(kGeluBeta))); + const vec::Vectorized kGeluKappaVec((T(kGeluKappa))); + auto x_cube = x * x * x; + vec::Vectorized inner_vec = kGeluBetaVec * (x + kGeluKappaVec * x_cube); + return kPointFiveVec * x * (kOneVec + inner_vec.tanh()); +} + +template , bool> = true> +vec::Vectorized vectorized_gelu_approximated_with_tanh(vec::Vectorized x) { + auto [x0, x1] = at::vec::convert_to_float(x); + return at::vec::convert_from_float( + vectorized_gelu_approximated_with_tanh(x0), + vectorized_gelu_approximated_with_tanh(x1)); +} + + +template +T scalar_gelu(T x) { + using opmath_t = reduced_fp_to_float_t; + const auto kAlpha = opmath_t(M_SQRT1_2); + return reduced_fp_to_float(x) * opmath_t(0.5) * (opmath_t(1) + std::erf(reduced_fp_to_float(x) * kAlpha)); +} + +template, bool> = true> +vec::Vectorized vectorized_gelu(vec::Vectorized x) { + const vec::Vectorized kAlphaVec(T(M_SQRT1_2)); + const vec::Vectorized kOneVec(T(1)); + const vec::Vectorized kPointFiveVec(T(0.5)); + return x * kPointFiveVec * (kOneVec + (x * kAlphaVec).erf()); +} + +template, bool> = true> +vec::Vectorized vectorized_gelu(vec::Vectorized x) { + auto [x0, x1] = at::vec::convert_to_float(x); + return at::vec::convert_from_float(vectorized_gelu(x0), vectorized_gelu(x1)); +} + +} // namespace CPU_CAPABILITY +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..657e54f03a7255746b63f5f7447d8c6e30fd5a70 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h @@ -0,0 +1,39 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +using forward_2d_fn = void (*) ( + const TensorBase &output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners); +using backward_2d_fn = void (*) ( + const TensorBase &grad_input, + const TensorBase &grad_grid, + const TensorBase &grad_output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + std::array output_mask); +DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel) +DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..3c4e71dc5f1e101980fcedc45e591eaff3bd10d3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h @@ -0,0 +1,90 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +inline bool is_constant_index(int ntensor, const int64_t* strides) { + AT_ASSERT(ntensor >= 3); + for (const auto arg : c10::irange(2, ntensor)) { + if (strides[arg] != 0) { + return false; + } + } + return true; +} + + +struct Indexer { + Indexer(int64_t num_indexers, char** indexers, const int64_t* indexer_strides, + IntArrayRef original_sizes, IntArrayRef original_strides) + : num_indexers(num_indexers) + , indexers(indexers) + , indexer_strides(indexer_strides) + , original_strides(original_strides.data()) + , original_sizes(original_sizes.data()) { + AT_ASSERT(static_cast(original_strides.size()) == num_indexers); + AT_ASSERT(static_cast(original_sizes.size()) == num_indexers); + } + + int64_t num_indexers; + char** indexers; + const int64_t* indexer_strides; + const int64_t* original_strides; + const int64_t* original_sizes; + + int64_t get(int64_t idx) { + int64_t offset = 0; + for (const auto j : c10::irange(num_indexers)) { + int64_t value = *(int64_t*)&indexers[j][idx * indexer_strides[j]]; + int64_t size = original_sizes[j]; + TORCH_CHECK_INDEX(value >= -size && value < size, + "index ", value, " is out of bounds for dimension ", j, " with size ", size); + if (value < 0) { + value += size; + } + offset += value * original_strides[j]; + } + return offset; + } +}; + +template +void cpu_index_kernel(TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride, + const func_t& f, bool serial_execution=false) +{ + int ntensor = iter.ntensors(); + // When launch the index parallel version, set a relative small grain size less than the INTERNAL::GRAIN_SIZE + // to make the whole available thread numbers get more balanced work load and a better cache location. + // The grain size here is chosen by the op benchmark to overcome the thread launch overhead + const int index_parallel_grain_size = 3000; + auto loop = [&](char** data, const int64_t* strides, int64_t n) { + auto indexer = Indexer(ntensor - 2, &data[2], &strides[2], index_size, index_stride); + char* dst = data[0]; + char* src = data[1]; + if (is_constant_index(ntensor, strides)) { + // specialization for when every element uses the same index + int64_t offset = indexer.get(0); + for (const auto i : c10::irange(n)) { + f(dst + strides[0] * i, src + strides[1] * i, offset); + } + } else { + for (const auto i : c10::irange(n)) { + int64_t offset = indexer.get(i); + f(dst + strides[0] * i, src + strides[1] * i, offset); + } + } + }; + if (serial_execution) { + iter.serial_for_each(loop, {0, iter.numel()}); + } else { + iter.for_each(loop, index_parallel_grain_size); + } +} +} // at +// native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Intrinsics.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..760927f022034d5b7ae136ece8afac3cf9f1d4d3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Intrinsics.h @@ -0,0 +1,38 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +/* Clang-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(_MSC_VER) +/* Microsoft C/C++-compatible compiler */ +#include +#if _MSC_VER <= 1900 +#define _mm256_extract_epi64(X, Y) (((uint64_t*)&X)[Y]) +#endif +#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +/* GCC-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(__GNUC__) && defined(__ARM_NEON__) +/* GCC-compatible compiler, targeting ARM with NEON */ +#include +#elif defined(__GNUC__) && defined(__IWMMXT__) +/* GCC-compatible compiler, targeting ARM with WMMX */ +#include +#elif (defined(__GNUC__) || defined(__xlC__)) && \ + (defined(__VEC__) || defined(__ALTIVEC__)) +/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ +#include +/* We need to undef those tokens defined by to avoid conflicts + with the C++ types. => Can still use __bool/__vector */ +#undef bool +#undef vector +#undef pixel +#elif defined(__GNUC__) && defined(__SPE__) +/* GCC-compatible compiler, targeting PowerPC with SPE */ +#include +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IsContiguous.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IsContiguous.h new file mode 100644 index 0000000000000000000000000000000000000000..caf687c87dfc9820600bd4c581a0125acb821e82 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IsContiguous.h @@ -0,0 +1,69 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at::native { inline namespace CPU_CAPABILITY { + +// n: number of function arguments (arity) +// traits: function_traits (see FunctionTraits.h) +// s: index of scalar argument or -1 +template +struct IsContiguous { + static bool eval(const int64_t* strides) { + using type = typename traits::template arg::type; + return strides[stride_index] == (s == n ? 0 : sizeof(type)) && + IsContiguous::eval(strides); + } +}; + +// will be called when there is an output exists +template +struct IsContiguous<0, 0, traits, s> { + static bool eval(const int64_t* strides) { + return strides[0] == sizeof(typename traits::result_type); + } +}; + +// will be called when there is no output +template +struct IsContiguous<0, -1, traits, s> { + static bool eval(const int64_t* /*strides*/) { + return true; + } +}; + +// output and all inputs are contiguous +template < + typename traits, + std::enable_if_t>* = + nullptr> +static inline bool is_contiguous(const int64_t* strides) { + return IsContiguous::eval(strides); +} + +template >* = nullptr> +static inline bool is_contiguous(const int64_t* strides) { + return IsContiguous::eval(strides); +} + +// input at `s` is scalar (stride 0); output and other inputs are contiguous +// NB: output is typically at strides[0] so first input corresponds to s=1 +template >* = nullptr> +static inline bool is_contiguous_scalar(const int64_t* strides) { + static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds"); + return IsContiguous::eval(strides); +} + +template >* = nullptr> +static inline bool is_contiguous_scalar(const int64_t* strides) { + static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds"); + return IsContiguous::eval(strides); +} + +}} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogAddExp.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogAddExp.h new file mode 100644 index 0000000000000000000000000000000000000000..2c3f03718a9d5c08dc5c0fa47bedfe12e2a8fd95 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogAddExp.h @@ -0,0 +1,66 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { + +// custom min and max to be used in logcumsumexp for complex arguments +template +std::pair, c10::complex> _logcumsumexp_minmax(c10::complex x, c10::complex y) { + if (at::_isnan(y)) { // either real is nan or imag is nan + return std::make_pair(y, y); + } else if (at::_isnan(x)) { // either real is nan or imag is nan + return std::make_pair(x, x); + } else { + return (x.real() < y.real()) ? std::make_pair(x, y) : std::make_pair(y, x); + } +} + +template +scalar_t _log_add_exp_helper(scalar_t x, scalar_t y) { + // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp + scalar_t min = at::_isnan(y) ? y : std::min(x, y); // std::min returns first arg if one of the args is nan + scalar_t max = at::_isnan(y) ? y : std::max(x, y); // std::max returns first arg if one of the args is nan + if (min != max || std::isfinite(min)) { + // nan will be propagated here + return std::log1p(std::exp(min - max)) + max; + } else { + // special case to correctly handle infinite cases + return x; + } +} + +template +c10::complex _log_add_exp_helper(const c10::complex& x, const c10::complex& y) { + auto [min, max] = _logcumsumexp_minmax(x, y); + auto min_real = std::real(min); + auto max_real = std::real(max); + + if (at::_isnan(min)) { // either real is nan or imag is nan + // handling the "infectious" NaNs + return {std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}; + } else if (!std::isfinite(min_real) && (min_real == max_real)) { + if (min_real < 0) { + // handle the -inf case, the imaginary part here does not really matter as the exp(value) + // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined. + // It does not matter if we're taking the exp of this value + return min; + } else { + // handle the +inf case, we don't need the special precision for log1p for small values + // and to avoid producing nan in case of real(max) == real(min) == +inf + return std::log(std::exp(min) + std::exp(max)); + } + } else { + return std::log1p(std::exp(min - max)) + max; + } +} + +} // end namespace +} //end at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogSoftmaxKernelImpl.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogSoftmaxKernelImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..64c68e9f263bdb816f39bafdff54ea9d3b3a8ef3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogSoftmaxKernelImpl.h @@ -0,0 +1,342 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { +template +int64_t vec_log_softmax_lastdim_chunk_size(int64_t grain_size, int64_t outer_size, int64_t dim_size) { + // Coincidentally, at::internal::GRAIN_SIZE is 32768, which is equal to the + // size of L1D cache on many processors. Some processors have 48 KB L1D cache + // nowadays, so maybe in the future, we can leverage the knowledge of a + // machine's L1D cache size. + int64_t MAX_CHUNK_SIZE = std::max( + 1, + grain_size / (sizeof(scalar_t) * dim_size)); + return std::min(MAX_CHUNK_SIZE, outer_size); +} + +template +void serial_vec_log_softmax_lastdim_range( + const scalar_t* input_data_base, + scalar_t* output_data_base, + int64_t dim_size, + int64_t chunk_size, + int64_t begin, + int64_t end) { + if (end <= begin) { + return; + } + using Vec = vec::Vectorized>; + // MSVC requires such a declaration of dynamic arrays + // Source: https://stackoverflow.com/a/33423538 + auto tmp_sum_scalar = std::make_unique(chunk_size); + auto max_input_arr = std::make_unique(chunk_size); + for (int64_t ii = begin; ii < end; ii += chunk_size) { + int64_t loop_end = chunk_size; + if (ii + chunk_size > end) { + loop_end = end - ii; + } + for (const auto j : c10::irange(loop_end)) { + int64_t i = ii + j; + const scalar_t* input_data = input_data_base + i * dim_size; + max_input_arr[j] = vec::reduce_all( + [](Vec& x, Vec& y) { return vec::maximum(x, y); }, + input_data, + dim_size); + } + for (const auto j : c10::irange(loop_end)) { + int64_t i = ii + j; + const scalar_t* input_data = input_data_base + i * dim_size; + scalar_t max_input = max_input_arr[j]; + tmp_sum_scalar[j] = vec::map_reduce_all( + [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, + [](Vec x, Vec y) { return x + y; }, + input_data, + dim_size); + } + // See [Note AVX-SSE transitions] for why this should call the + // vectorized version (aside from perf improvements). + vec::map( + [](Vec x) { return x.log(); }, + tmp_sum_scalar.get(), + tmp_sum_scalar.get(), + loop_end); + for (const auto j : c10::irange(loop_end)) { + int64_t i = ii + j; + const scalar_t* input_data = input_data_base + i * dim_size; + scalar_t* output_data = output_data_base + i * dim_size; + scalar_t tmp_sum = tmp_sum_scalar[j]; + scalar_t max_input = max_input_arr[j]; + + // It's necessary to keep the order of the operations below. + // In some cases that input is large digits and the difference + // is small, if we compute `max_input` plus `tmp_sum` before, + // there would be a numerical problem. See an example in + // https://github.com/pytorch/pytorch/issues/11752#issuecomment-422883379 + vec::map( + [tmp_sum, max_input](Vec x) { + return x - Vec(max_input) - Vec(tmp_sum); + }, + output_data, + input_data, + dim_size); + } + } +} + +// Can't include ATen/Parallel.h. +// TODO: find a way to have only one copy of divup. +inline int64_t divup(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + +template +std::pair vec_logsoftmax_chunk_size_and_num_chunks(int64_t inner_size, int64_t dim_size) { + using Vec = vec::Vectorized; + int64_t MAX_CHUNK_SIZE = std::max(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); + int64_t num_chunks = divup(inner_size, CHUNK_SIZE); + return {CHUNK_SIZE, num_chunks}; +} + +template +std::enable_if_t>, void> +serial_vec_logsoftmax_range( + const scalar_t* input_data_base, + scalar_t* output_data_base, + int64_t inner_size, + int64_t chunk_size, + int64_t num_chunks, + int64_t dim_size, + int64_t begin, + int64_t end) { + using Vec = vec::Vectorized; + // thread local temp buffer which holds vertical reduction result: max and sum. + auto buffer = std::make_unique(chunk_size * 2); + scalar_t* input_max_data = buffer.get(); + scalar_t* tmp_sum_data = buffer.get() + chunk_size; + + for (int64_t i = begin; i < end; i++) { + int64_t outer_idx = i / num_chunks; + int64_t k = i % num_chunks; + int64_t inner_idx_begin = k * chunk_size; + int64_t size = std::min(chunk_size, inner_size - inner_idx_begin); + + // init + Vec zero_vec = Vec(scalar_t(0)); + Vec min_vec = Vec(-std::numeric_limits::infinity()); + int64_t d0 = 0; + for (; d0 < size - (size % Vec::size()); d0 += Vec::size()) { + min_vec.store(input_max_data + d0); + zero_vec.store(tmp_sum_data + d0); + } + for (; d0 < size; d0++) { + input_max_data[d0] = -std::numeric_limits::infinity(); + tmp_sum_data[d0] = scalar_t(0); + } + + // compute max + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + + dim_idx * inner_size + inner_idx_begin; + + int64_t d1 = 0; + for (; d1 < size - (size % Vec::size()); d1 += Vec::size()) { + Vec data_vec = Vec::loadu(input_ptr + d1); + Vec max_vec = Vec::loadu(input_max_data + d1); + max_vec = Vec::blendv(max_vec, data_vec, data_vec > max_vec); + max_vec.store(input_max_data + d1); + } + for (; d1 < size; d1++) { + scalar_t data_val = input_ptr[d1]; + scalar_t max_val = input_max_data[d1]; + input_max_data[d1] = data_val > max_val ? data_val : max_val; + } + } + + // compute sum of (x - max).exp() + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + + dim_idx * inner_size + inner_idx_begin; + + int64_t d2 = 0; + for (; d2 < size - (size % Vec::size()); d2 += Vec::size()) { + Vec data_vec = Vec::loadu(input_ptr + d2); + Vec sum_vec = Vec::loadu(tmp_sum_data + d2); + Vec max_vec = Vec::loadu(input_max_data + d2); + sum_vec += (data_vec - max_vec).exp(); + sum_vec.store(tmp_sum_data + d2); + } + for (; d2 < size; d2++) { + scalar_t data_val = input_ptr[d2]; + scalar_t max_val = input_max_data[d2]; + tmp_sum_data[d2] += std::exp(data_val - max_val); + } + } + + // apply log + vec::map([](Vec x) { return x.log(); }, tmp_sum_data, tmp_sum_data, size); + + // compute x - max - sum + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + int64_t offset = outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx_begin; + const scalar_t* input_ptr = input_data_base + offset; + scalar_t* output_ptr = output_data_base + offset; + + int64_t d3 = 0; + for (; d3 < size - (size % Vec::size()); d3 += Vec::size()) { + Vec data_vec = Vec::loadu(input_ptr + d3); + Vec max_vec = Vec::loadu(input_max_data + d3); + Vec sum_vec = Vec::loadu(tmp_sum_data + d3); + Vec out_vec = data_vec - max_vec - sum_vec; + out_vec.store(output_ptr + d3); + } + for (; d3 < size; d3++) { + output_ptr[d3] = input_ptr[d3] - input_max_data[d3] - tmp_sum_data[d3]; + } + } + } +} + +template +std::enable_if_t>, void> +serial_vec_logsoftmax_range( + const scalar_t* input_data_base, + scalar_t* output_data_base, + int64_t inner_size, + int64_t chunk_size, + int64_t num_chunks, + int64_t dim_size, + int64_t begin, + int64_t end) { + using Vec = vec::Vectorized; + using fVec = vec::Vectorized; + auto buffer = std::make_unique(chunk_size * 2); + float* input_max_data = buffer.get(); + float* tmp_sum_data = buffer.get() + chunk_size; + + // thread local buffer that holds input data in float32 to save next 2 dtype conversion + auto input_buffer = std::make_unique(dim_size * chunk_size); + float* input_buffer_data = input_buffer.get(); + + // init + for (int64_t i = begin; i < end; i++) { + int64_t outer_idx = i / num_chunks; + int64_t k = i % num_chunks; + int64_t inner_idx_begin = k * chunk_size; + int64_t size = std::min(chunk_size, inner_size - inner_idx_begin); + + fVec zero_fvec = fVec(float(0)); + fVec min_fvec = fVec(-std::numeric_limits::infinity()); + int64_t d0 = 0; + for (; d0 < size - (size % Vec::size()); d0 += Vec::size()) { + min_fvec.store(input_max_data + d0); + min_fvec.store(input_max_data + d0 + fVec::size()); + zero_fvec.store(tmp_sum_data + d0); + zero_fvec.store(tmp_sum_data + d0 + fVec::size()); + } + for (; d0 < size; d0++) { + input_max_data[d0] = -std::numeric_limits::infinity(); + tmp_sum_data[d0] = float(0); + } + + // compute max + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + + dim_idx * inner_size + inner_idx_begin; + float* input_buffer_ptr = input_buffer_data + dim_idx * chunk_size; + + int64_t d1 = 0; + for (; d1 < size - (size % Vec::size()); d1 += Vec::size()) { + Vec data_vec = Vec::loadu(input_ptr + d1); + auto [data_fvec0, data_fvec1] = vec::convert_to_float(data_vec); + fVec max_fvec0 = fVec::loadu(input_max_data + d1); + fVec max_fvec1 = fVec::loadu(input_max_data + d1 + fVec::size()); + max_fvec0 = fVec::blendv(max_fvec0, data_fvec0, data_fvec0 > max_fvec0); + max_fvec1 = fVec::blendv(max_fvec1, data_fvec1, data_fvec1 > max_fvec1); + max_fvec0.store(input_max_data + d1); + max_fvec1.store(input_max_data + d1 + fVec::size()); + + // cache the 'converted' float input + data_fvec0.store(input_buffer_ptr + d1); + data_fvec1.store(input_buffer_ptr + d1 + fVec::size()); + } + for (; d1 < size; d1++) { + float data_val = float(input_ptr[d1]); + float max_val = input_max_data[d1]; + input_max_data[d1] = data_val > max_val ? data_val : max_val; + input_buffer_ptr[d1] = data_val; + } + } + + // compute sum of (x - max).exp() + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + float* input_buffer_ptr = input_buffer_data + dim_idx * chunk_size; + + int64_t d2 = 0; + for (; d2 < size - (size % Vec::size()); d2 += Vec::size()) { + fVec data_fvec0 = fVec::loadu(input_buffer_ptr + d2); + fVec data_fvec1 = fVec::loadu(input_buffer_ptr + d2 + fVec::size()); + fVec sum_fvec0 = fVec::loadu(tmp_sum_data + d2); + fVec sum_fvec1 = fVec::loadu(tmp_sum_data + d2 + fVec::size()); + fVec max_fvec0 = fVec::loadu(input_max_data + d2); + fVec max_fvec1 = fVec::loadu(input_max_data + d2 + fVec::size()); + sum_fvec0 += (data_fvec0 - max_fvec0).exp(); + sum_fvec1 += (data_fvec1 - max_fvec1).exp(); + sum_fvec0.store(tmp_sum_data + d2); + sum_fvec1.store(tmp_sum_data + d2 + fVec::size()); + } + for (; d2 < size; d2++) { + float data_val = input_buffer_ptr[d2]; + float max_val = input_max_data[d2]; + tmp_sum_data[d2] += std::exp(data_val - max_val); + } + } + + // apply log + vec::map([](fVec x) { return x.log(); }, tmp_sum_data, tmp_sum_data, size); + + // compute x - max - sum + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + float* input_buffer_ptr = input_buffer_data + dim_idx * chunk_size; + scalar_t* output_ptr = output_data_base + outer_idx * dim_size * inner_size + + dim_idx * inner_size + inner_idx_begin; + + int64_t d3 = 0; + for (; d3 < size - (size % Vec::size()); d3 += Vec::size()) { + fVec data_fvec0 = fVec::loadu(input_buffer_ptr + d3); + fVec data_fvec1 = fVec::loadu(input_buffer_ptr + d3 + fVec::size()); + fVec max_fvec0 = fVec::loadu(input_max_data + d3); + fVec max_fvec1 = fVec::loadu(input_max_data + d3 + fVec::size()); + fVec sum_fvec0 = fVec::loadu(tmp_sum_data + d3); + fVec sum_fvec1 = fVec::loadu(tmp_sum_data + d3 + fVec::size()); + fVec out_fvec0 = data_fvec0 - max_fvec0 - sum_fvec0; + fVec out_fvec1 = data_fvec1 - max_fvec1 - sum_fvec1; + Vec out_vec = vec::convert_from_float(out_fvec0, out_fvec1); + out_vec.store(output_ptr + d3); + } + for (; d3 < size; d3++) { + output_ptr[d3] = scalar_t(input_buffer_ptr[d3] - input_max_data[d3] - tmp_sum_data[d3]); + } + } + } +} // namespace CPU_CAPABILITY +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Loops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Loops.h new file mode 100644 index 0000000000000000000000000000000000000000..b692065e32066ac9831af81ef5946ce4a9bb858d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Loops.h @@ -0,0 +1,400 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// This file provides two functions to help write elementwise kernels: +// +// cpu_kernel(TensorIterator iter, ) +// cpu_kernel_vec(TensorIterator iter, , ) +// +// Both functions may generate vectorized code. The cpu_kernel implementation +// relies on the compiler's auto-vectorization. The cpu_kernel_vec +// implementation uses x86 SIMD intrinsics when available. These functions +// are only intended to be used in the ATen/native/cpu subdirectory, since files +// in other directories are not compiled with AVX/AVX2 enabled. See README.md +// for more details. +// +// For example, to write a multiplication kernel for float: +// +// cpu_kernel(iter, [](float a, float b) { return a * b; }); +// +// Or you may write: +// +// cpu_kernel_vec(iter, +// [](float a, float b) { return a * b; }, +// [](Vectorized a, Vectorized b) { return a * b; }); +// +// See BinaryOpsKernel.cpp for the complete implementation +// +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at::native { inline namespace CPU_CAPABILITY { + +using namespace vec; + +template +typename traits::ArgsTuple +dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, + std::index_sequence /*unused*/) { + return std::make_tuple( + c10::load::type>( + data[INDEX] + i * strides[INDEX])...); +} + +template +typename traits::ArgsTuple +dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) { + using Indices = std::make_index_sequence; + return dereference_impl(data, strides, i, Indices{}); +} + +template +typename traits::ArgsTuple +dereference_vec_impl(char* C10_RESTRICT data[], + const typename traits::result_type& opt_scalar, + size_t S, + int64_t i, + std::index_sequence /*unused*/) { + using Vec = typename traits::result_type; + using scalar_t = typename Vec::value_type; + return std::make_tuple( + S == INDEX + 1 ? + opt_scalar : + Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...); +} + +template +typename traits::ArgsTuple +dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) { + using Indices = std::make_index_sequence; + return dereference_vec_impl(data, opt_scalar, S, i, Indices{}); +} + +template ::result_type>>* = nullptr> +inline void +execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + using result_type = typename traits::result_type; + for (; i < n; i++) { + result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); + *out_ptr = std::apply(op, dereference( + &data[1], + &strides[1], + i)); + } +} + +template ::result_type>>* = nullptr> +inline void +execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + for (; i < n; i++) { + std::apply(op, dereference( + &data[0], + &strides[0], + i)); + } +} + +// Basic loop operation (one output, N inputs). May be auto-vectorized +// by the compiler. Supports inputs and outputs of different types. +template +inline void +basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + constexpr int ntensors = traits::arity + 1; + + // Copying strides to temporary array helps auto vectorization in older GCC + // versions. + int64_t strides[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + strides[arg] = strides_[arg]; + } + + execute_op(data, strides, i, n, std::forward(op)); +} + +// the recursive variadic template for iterating over the returned tuple +template +struct TupleOutput { + static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i, + const T &tuple) { + TupleOutput::handle(data, strides, i, tuple); + + auto output = std::get(tuple); + using output_type = decltype(output); + output_type * out_ptr = (output_type *)(data[N - 1] + i * strides[N - 1]); + *out_ptr = output; + } +}; + +// Base case for the above recursive template +template +struct TupleOutput { + static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i, + const T &tuple) { + auto output = std::get<0>(tuple); + using output_type = decltype(output); + output_type* out_ptr = (output_type *)(data[0] + i * strides[0]); + *out_ptr = output; + } +}; + +template +void handle_tuple_outputs(char* C10_RESTRICT data[], + const int64_t* strides, + int64_t i, + const std::tuple &tuple) { + TupleOutput::handle(data, strides, i, tuple); +} + +// Loop operation for `cpu_kernel_multiple_outputs`. +// 1. Use `std::apply` to make dynamic method invocation +// for the lambda passed in `cpu_kernel_multiple_outputs`. +// 2. Iterate over the members of the returned tuple, set the corresponding +// output tensor by the tuple member in `handle_tuple_outputs` function. +template +inline void +multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + + using result_type = typename traits::result_type; + constexpr int num_outputs = std::tuple_size_v; + constexpr int ntensors = traits::arity + num_outputs; + + // Copying strides to temporary array helps auto vectorization in older GCC + // versions. + int64_t strides[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + strides[arg] = strides_[arg]; + } + + for (; i < n; i++) { + auto output = std::apply(op, dereference( + &data[num_outputs], + &strides[num_outputs], + i)); + handle_tuple_outputs(data, strides, i, output); + } +} + +// Explicitly vectorized loop implementation. All inputs and outputs must be +// the same type and contiguous with one exception: a single input may be +// a scalar (stride 0). It's position is indicated by the argument `S`. If `S` +// is 0, then there are no scalar inputs. +template +inline void +vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) { + using traits = function_traits; + using scalar_t = typename function_traits::result_type; + using Vec = Vectorized; + constexpr int ntensors = traits::arity + 1; + + char* C10_RESTRICT data[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + data[arg] = data_[arg]; + } + + Vec opt_scalar = Vec(S > 0 ? c10::load((scalar_t*)data[S]) : scalar_t(0)); + int64_t i = 0; + for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { + auto args1 = dereference_vec(&data[1], opt_scalar, S, i); + auto args2 = dereference_vec(&data[1], opt_scalar, S, i + Vec::size()); + auto out1 = std::apply(vop, std::move(args1)); + auto out2 = std::apply(vop, std::move(args2)); + out1.store(data[0] + i * sizeof(scalar_t)); + out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t)); + } + if (i < n) { + int64_t strides[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + strides[arg] = (S > 0 && arg == S) ? 0 : sizeof(scalar_t); + } + basic_loop(data, strides, i, n, std::forward(op)); + } +} + + +template +inline void unroll_contiguous_scalar_checks( + const int64_t* /*strides*/, + std::index_sequence<> /*unused*/, + cb_t&& cb) { + cb(0); +} + +template +inline void unroll_contiguous_scalar_checks( + const int64_t* strides, + std::index_sequence /*unused*/, + cb_t&& cb) { + if (is_contiguous_scalar(strides)) { + cb(INDEX0 + 1); + } else { + unroll_contiguous_scalar_checks(strides, std::index_sequence{}, std::forward(cb)); + } +} + +template +struct VectorizedLoop2d { + op_t op; + vop_t vop; + + using traits = function_traits; + static constexpr int ntensors = traits::arity + 1; + using data_t = std::array; + + VectorizedLoop2d(op_t op, vop_t vop): + op(std::move(op)), vop(std::move(vop)) {} + + static void advance(data_t &data, const int64_t *outer_strides) { + for (const auto arg : c10::irange(data.size())) { + data[arg] += outer_strides[arg]; + } + } + + void operator()(char** base, const int64_t *strides, int64_t size0, int64_t size1) { + data_t data; + std::copy_n(base, ntensors, data.data()); + const int64_t *outer_strides = &strides[ntensors]; + + if (is_contiguous(strides)) { + for ([[maybe_unused]] const auto i : c10::irange(size1)) { + vectorized_loop(data.data(), size0, 0, op, vop); + advance(data, outer_strides); + } + } else { + using Indices = std::make_index_sequence; + unroll_contiguous_scalar_checks(strides, Indices{}, [&](size_t idx) { + if (idx) { + for ([[maybe_unused]] const auto i : c10::irange(size1)) { + vectorized_loop(data.data(), size0, idx, op, vop); + advance(data, outer_strides); + } + } else { + for ([[maybe_unused]] const auto i : c10::irange(size1)) { + basic_loop(data.data(), strides, 0, size0, op); + advance(data, outer_strides); + } + } + }); + } + } +}; + +template +VectorizedLoop2d make_vectorized_loop2d( + op_t &&op, vop_t &&vop) { + return VectorizedLoop2d(std::forward(op), std::forward(vop)); +} + +template +void cpu_kernel(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) { + using traits = function_traits; + // this could be extended to work with void return types + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + // dynamic casting not currently supported on CPU + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + iter.for_each([&](char** data, const int64_t* strides, int64_t n) { + // basic loop can handle 1d slices with arbitrary strides, and 1d slices is all that + // iter.for_each is ever sending to the loop lambda + basic_loop(data, strides, 0, n, op); + }, grain_size); + iter.cast_outputs(); +} + +// This function helps write elementwise kernels that requires multiple outputs. +// It follows the similar structure of cpu_kernel. +// Instead of `basic_loop` function, a new `multiple_outputs_loop` function is +// manipulated to handle multiple return values. +// For now `needs_dynamic_casting` check is not added as the passed lambda (`func_t`) +// of `multiple_outputs_loop` returns `std::tuple` instead of `scalar_t`. +// The `gpu_kernel_multiple_outputs` is also implemented without this check, +// We could extend `needs_dynamic_casting` to support both `std::tuple` and +// `thrust::tuple` in the future. +template +void cpu_kernel_multiple_outputs(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) { + using traits = function_traits; + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + + iter.for_each([&](char** data, const int64_t* strides, int64_t n) { + multiple_outputs_loop(data, strides, 0, n, op); + }, grain_size); + iter.cast_outputs(); +} + +template +void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, int64_t grain_size = at::internal::GRAIN_SIZE) { + using traits = function_traits; + // this could be extended to work with void return types + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + // dynamic casting not currently supported on CPU, but some kernels (like Fill) + // explicitly dynamic_cast, so we give the opt-out of checking. + if constexpr (check_dynamic_cast) { + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + } + + iter.for_each(make_vectorized_loop2d(std::forward(op), std::forward(vop)), grain_size); + iter.cast_outputs(); +} + +template +void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) { + using traits = function_traits; + constexpr bool result_void = std::is_void_v; + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity && + ((result_void && iter.noutputs() == 0) || (!result_void && iter.noutputs() == 1))); + // dynamic casting not currently supported on CPU + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) { + basic_loop(data, strides, 0, n, op); + }, range); + iter.cast_outputs(); +} + +template +void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op) { + cpu_serial_kernel(iter, std::forward(op), {0, iter.numel()}); +} + +template +void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) { + using traits = function_traits; + // this could be extended to work with void return types + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + // dynamic casting not currently supported on CPU + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + iter.serial_for_each(make_vectorized_loop2d(std::forward(op), std::forward(vop)), range); + iter.cast_outputs(); +} + +template +void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) { + cpu_serial_kernel_vec(iter, std::forward(op), std::forward(vop), {0, iter.numel()}); +} + +}} // namespace at::native:: + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..1f43c4ee1ee3a3e9a220b4032f5b9bd945c846ed --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { +class Tensor; + +namespace native { + +using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&); + +DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel) +DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel) + +}} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..760d9af1c2731eab1111faa7cfadbaed38317f17 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +using pixel_shuffle_fn = void(*)(TensorBase&, const TensorBase&, int64_t); +DECLARE_DISPATCH(pixel_shuffle_fn, pixel_shuffle_kernel) +DECLARE_DISPATCH(pixel_shuffle_fn, pixel_unshuffle_kernel) + +} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Reduce.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..6af3a57749a51e46a6a536d164b2a1218e5bb269 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Reduce.h @@ -0,0 +1,315 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace at::native { inline namespace CPU_CAPABILITY { + +using namespace vec; + +#define VEC_LOOP_HEADER(func_t, data) \ + using scalar_t = typename function_traits::result_type; \ + using Vec = Vectorized; \ + char* out_ptr = data[0]; \ + (void) out_ptr; + +// reduction that is contiguous over the input in dim 0 +template +inline bool is_contiguous_reduction(const int64_t* strides) { + return strides[0] == 0 && + strides[1] == sizeof(typename traits::arg2_t); +} + +// reduction that is contiguous over the input in dim 1 +template +inline bool is_outer_reduction(const int64_t* strides) { + return strides[0] == 0 && + strides[2] == sizeof(typename traits::result_type) && + strides[3] == sizeof(typename traits::arg2_t); +} + +template +inline void vectorized_reduction(char** data, int64_t n, int64_t stride, + func_t op, vec_func_t vop, bool reduce) { + VEC_LOOP_HEADER(func_t, data) + const char* in1_ptr = data[1]; + Vec acc[4]; + for (const auto j : c10::irange(4)) { + acc[j] = Vec::loadu(in1_ptr + j * Vec::size() * sizeof(scalar_t)); + } + for (const auto i : c10::irange(1, n)) { + const char* ptr = in1_ptr + stride * i; + acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t)))); + acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t)))); + acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t)))); + acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t)))); + } + if (reduce) { + scalar_t buffer[Vec::size()]; + acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3])); + acc[0].store(buffer); + for (const auto j : c10::irange(1, Vec::size())) { + buffer[0] = op(buffer[0], buffer[j]); + } + auto dst = (scalar_t*)out_ptr; + *dst = op(*dst, buffer[0]); + } else { + for (const auto j : c10::irange(4)) { + auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t); + acc[j] = vop(acc[j], Vec::loadu(dst)); + acc[j].store(dst); + } + } +} + +template +inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n, F f) { + for ([[maybe_unused]] const auto j : c10::irange(n)) { + f(); + data[0] += strides[0]; + data[1] += strides[1]; + } +} + +// computes the reduction out = op(out, in) +template +inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) { + VEC_LOOP_HEADER(func_t, data) + constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t); + int64_t count = n / (4 * Vec::size()); + if (count > 0) { + vectorized_reduction(data, count, vector_stride, op, vop, /*reduce=*/true); + } + char* ptrs[3] = { data[0], data[0], data[1] }; + int64_t strides[] = { 0, 0, sizeof(scalar_t) }; + basic_loop(ptrs, strides, count * 4 * Vec::size(), n, op); +} + +// computes the reduction out = op(out, in) +template +inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) { + VEC_LOOP_HEADER(func_t, data) + + // reduce down each column of 4 * Vec::size() elements. + constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t); + int64_t outer_stride[2] = { vector_stride, vector_stride }; + UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] { + vectorized_reduction(data, size0, inner_stride, op, vop, /*reduce=*/false); + }); + + // reduce down the remaining columns + int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) }; + int64_t remaining = size1 % (4 * Vec::size()); + UNARY_OUTER_LOOP(data, step, remaining, [&] { + char* ptrs[3] = { data[0], data[0], data[1] }; + int64_t strides[] = { 0, 0, inner_stride }; + basic_loop(ptrs, strides, 0, size0, op); + }); +} + +template +static void set_result(const int index, const res_t result, const TensorIteratorBase &iter, const int num_outputs) { + // static_assert(std::is_same_v, "data types must match"); + if (index < num_outputs) { + char *out = (char *) iter.data_ptr(index); + *(res_t *) out = result; + } +} + +template +static void set_results(const res_t result, const TensorIteratorBase &iter, const int num_outputs) { + AT_ASSERT(num_outputs == 1); + set_result(0, result, iter, num_outputs); +} + +template +inline std::enable_if_t +for_each_in_tuple(const std::tuple& /*t*/, const TensorIteratorBase& /*iter*/, const int /*num_outputs*/) { + return i; +} + +template +inline std::enable_if_t +for_each_in_tuple(const std::tuple& t, const TensorIteratorBase &iter, const int num_outputs) { + if (i < (size_t)num_outputs) { + set_result(i, std::get(t), iter, num_outputs); + return for_each_in_tuple(t, iter, num_outputs); + } + return i; +} + +template +static void set_results(const std::tuple& result, const TensorIteratorBase &iter, const int num_outputs) { + AT_ASSERT(num_outputs >= 1); + std::size_t result_size = for_each_in_tuple(result, iter, num_outputs); + AT_ASSERT((size_t)num_outputs == result_size); +} + +template +struct all_same : std::conjunction< + std::is_same... +> {}; + +// data_t is the input/output data type. +// acc_t is a type that contains all the necessary data +// to continue reducing. +// index_t is a one-dimensional index +// +// ops_t is such that &ops_t::reduce, &ops_t::combine, and &ops_t::project exist and satisfy +// the following. +// reduce: (acc_t, data_t, index_t) -> acc_t adds one data point to the accumulated value. +// combine: (acc_t, acc_t) -> acc_t combines two accumulated values into one. +// project: acc_t -> out_t finishes the reduction, getting the required output. +// +// Additionally, acc_t must be default-constructible: +// acc_t {} is an identity for combine, +// and project(acc_t {}) is the value of the operation on zero elements. +// +// The point of `combine` is to support parallelization - +// the idea is to one sequence of `reduce` calls per thread of execution, +// and then to combine them at the end with `combine`. +// +// If there is more than one output element, +// our parallelization strategy is to use one thread for each of them, +// which means that `combine` will never be called. +// +// If, on the other hand, there is only one, then we split the input into +// into several pieces, reduce each separately, and then combine them. + +template +void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) { + using rf_t = decltype(&ops_t::reduce); + using cf_t = decltype(&ops_t::combine); + using pf_t = decltype(&ops_t::project); + using r_traits = binary_function_traits; + using c_traits = binary_function_traits; + using p_traits = unary_function_traits; + using acc_t = typename p_traits::arg1_t; + using data_t = typename r_traits::arg2_t; + static_assert( + all_same< + acc_t, + init_t, + typename r_traits::arg1_t, + typename r_traits::result_type, + typename c_traits::arg1_t, + typename c_traits::arg2_t, + typename c_traits::result_type>::value, + "all accumulate types must match"); + static_assert( + std::is_default_constructible_v, + "the accumulate type must be default-constructible" + ); + const int num_outputs = iter.noutputs(); + iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIteratorBase &sub_iter) { + auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t { + int ntensors = sub_iter.ntensors(); + sub_iter.serial_for_each([&acc, &ops, num_outputs, ntensors, begin](char** data, const int64_t* strides, int64_t size) { + AT_ASSERT(ntensors - num_outputs == 1); + char *in = data[ntensors - 1]; + int64_t stride = strides[ntensors - 1]; + for (const auto i : c10::irange(size)) { + acc = ops.reduce(acc, c10::load(in), begin + i); + in += stride; + } + }, {begin, end}); + return ops.translate_idx(acc, sub_iter.view_offsets()[0]); + }; + acc_t total_acc = init; + auto numel = sub_iter.numel(); + if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 || + at::in_parallel_region()) { + total_acc = reduction_body(total_acc, 0, numel); + } else { + int max_threads = at::get_num_threads(); + AT_ASSERT(max_threads > 0); + static_assert( + !std::is_same_v, + "Concurrently modifying different references into std::vector is UB." + ); + std::vector buffer((unsigned)max_threads, init); + at::parallel_for(0, numel, internal::GRAIN_SIZE, + [&](int64_t begin, int64_t end) { + auto& acc = buffer[at::get_thread_num()]; + acc = reduction_body(acc, begin, end); + } + ); + for (const auto i : c10::irange(max_threads)) { + total_acc = ops.combine(total_acc, buffer[i]); + } + } + set_results(ops.project(total_acc), sub_iter, num_outputs); + }); +} + +template +void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) { + using traits = binary_function_traits; + static_assert( + all_same< + typename traits::result_type, + typename traits::arg1_t, + typename traits::arg2_t>::value, + "all types must match"); + + iter.output_base().fill_(ident); + iter.parallel_reduce([&](char** data, const int64_t* strides, int64_t size0, int64_t size1) { + int64_t outer_strides[] = { strides[2], strides[3] }; + if (is_contiguous_reduction(strides)) { + // input is contiguous in dim 0, output is reduced in dim 0 + UNARY_OUTER_LOOP(data, outer_strides, size1, [&] { + vectorized_inner_reduction(data, size0, op, vop); + }); + } else if (is_outer_reduction(strides)) { + // input and output are contiguous in dim 1 + int64_t inner_stride = strides[1]; // stride of input in dim 0 + vectorized_outer_reduction(data, inner_stride, size0, size1, op, vop); + } else { + UNARY_OUTER_LOOP(data, outer_strides, size1, [&] { + char* ptrs[3] = { data[0], data[0], data[1] }; + int64_t inner_strides[3] = { strides[0], strides[0], strides[1] }; + basic_loop(ptrs, inner_strides, 0, size0, op); + }); + } + }); +} + +// when reduction is on most inner dimension (dim 0 in TensorIterator) +// and input has contiguous most inner dimension, `binary_kernel_reduce_lastdim` +// can be used. +inline bool is_reduce_lastdim(TensorIteratorBase& iter) { + return iter.num_reduce_dims() == 1 && iter.is_dim_reduced(0) + && iter.ninputs() == 1 && iter.strides(1)[0] == iter.element_size(1); +} + +template +void binary_kernel_reduce_lastdim(TensorIteratorBase& iter, reduce_func_t reduce_op) { + auto shape = iter.shape(); + int64_t dim_size = shape[0]; + int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / dim_size); + TensorIterator sub_iter(iter); + // create sub iterator to parallel on all non-reduce-dims + sub_iter.narrow(0, 0, 1); + auto loop = [&](char** data, const int64_t* strides, int64_t size) { + char* out = data[0]; + char* in = data[1]; + for (int64_t i = 0; i < size; ++i) { + reduce_op(out, in, dim_size); + out += strides[0]; + in += strides[1]; + } + }; + sub_iter.for_each(loop, grain_size); +} + +}} // namespace at::native:: + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..ea176a6c61667c37332a4b28b1734b605bb93c56 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h @@ -0,0 +1,242 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { + +using namespace vec; + +#define AT_DISPATCH_REDUCTION_TYPES(op, ...) \ + [&] { \ + switch (op) { \ + case ReductionType::SUM: { \ + static constexpr auto reduce = ReductionType::SUM; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::MEAN: { \ + static constexpr auto reduce = ReductionType::MEAN; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::MIN: { \ + static constexpr auto reduce = ReductionType::MIN; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::MAX: { \ + static constexpr auto reduce = ReductionType::MAX; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::PROD: { \ + static constexpr auto reduce = ReductionType::PROD; \ + return __VA_ARGS__(); \ + } \ + } \ + }() + +template +inline vec_scalar_t init_value() { + using acc_t = vec_scalar_t; + acc_t val; + if (reduce == ReductionType::SUM || + reduce == ReductionType::MEAN) { + val = static_cast(0); + } else if (reduce == ReductionType::PROD) { + val = static_cast(1); + } else if (reduce == ReductionType::MAX) { + val = -std::numeric_limits::infinity(); + } else { + TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN); + val = std::numeric_limits::infinity(); + } + return val; +} + +template +inline vec_scalar_t init_value(const std::optional& initial) { + using acc_t = vec_scalar_t; + if (initial.has_value()) { + return initial.value().to(); + } else { + return init_value(); + } +} + +template +inline void init(scalar_t* out, int64_t size, const vec_scalar_t& val) { + using Vec = Vectorized>; + map( + [val](Vec x) { return Vec(val); }, + out, + out, + size); +} + +template +inline void init(scalar_t* out, int64_t size, const std::optional& initial) { + using acc_t = vec_scalar_t; + acc_t val = init_value(initial); + init(out, size, val); +} + +// overload with `include_self`, used by scatter_reduce +template +inline void init(scalar_t* out, int64_t size, bool include_self = false) { + using acc_t = vec_scalar_t; + if (!include_self) { + acc_t val = init_value(); + init(out, size, val); + } +} + +template +inline void _init(scalar_t* self_ptr, at::opmath_type* buffer_ptr, int64_t size, bool include_self) { + if (!include_self) { + init, reduce>(buffer_ptr, size, include_self); + } else { + vec::convert(self_ptr, buffer_ptr, size); + } +} + +template +inline std::enable_if_t, scalar_t> +_max(const scalar_t& x, const scalar_t& y) { + return at::_isnan(y) ? y : std::max(x, y); +} + +template +inline Vectorized _max(const Vectorized& x, const Vectorized& y) { + // vec::maximum propagates NaN + return vec::maximum(x, y); +} + +template +inline std::enable_if_t, Vec2> +_max(const vec_t& x, const vec_t& y) { + // vec::maximum propagates NaN + return maximum(x, y); +} + +template +inline std::enable_if_t, scalar_t> +_min(const scalar_t& x, const scalar_t& y) { + return at::_isnan(y) ? y : std::min(x, y); +} + +template +inline Vectorized _min(const Vectorized& x, const Vectorized& y) { + // vec::minimum propagates NaN + return vec::minimum(x, y); +} + +template +inline std::enable_if_t, Vec2> +_min(const vec_t& x, const vec_t& y) { + // vec::minimum propagates NaN + return minimum(x, y); +} + +template , int> = 0> +inline void map_acc( + const Op& vec_fun, + accumut* output_data, + const accumut* input_data, + const scalar_t* input_data2, + int64_t size) { + using Vec = vec::Vectorized; + using aVec = vec::Vectorized; + int64_t d = 0; + constexpr int64_t kVecSize = Vec::size(); + constexpr int64_t kaVecSize = aVec::size(); + for (d = 0; d < size - (size % kVecSize); d += kVecSize) { + Vec data2_vec = Vec::loadu(input_data2 + d); + auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec); + aVec input_vec0 = aVec::loadu(input_data + d); + aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize); + vec_fun(input_vec0, data2_avec0).store(output_data + d); + vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize); + } + if (size - d > 0) { + int64_t tail_size = size - d; + Vec data2_vec = Vec::loadu(input_data2 + d, tail_size); + auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec); + if (tail_size > kaVecSize) { + aVec input_vec0 = aVec::loadu(input_data + d); + aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize, tail_size - kaVecSize); + vec_fun(input_vec0, data2_avec0).store(output_data + d); + vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize, tail_size - kaVecSize); + } else { + aVec input_vec0 = aVec::loadu(input_data + d, tail_size); + vec_fun(input_vec0, data2_avec0).store(output_data + d, tail_size); + } + } +} + +// for Max and Min, propagate NaN: +template +inline T update(const T& x, const T& y) { + if (reduce == ReductionType::SUM || + reduce == ReductionType::MEAN) { + return x + y; + } else if (reduce == ReductionType::PROD) { + return x * y; + } else if (reduce == ReductionType::MAX) { + return _max(x, y); + } else { + TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN); + return _min(x, y); + } +} + +template +inline void update(scalar_t* out, const scalar_t* data, int64_t K) { + using Vec = vec::Vectorized>; + map2( + [](Vec x, Vec y) { return update(x, y); }, + out, + out, + data, + K); +} + +template , int> = 0> +inline void update(at::opmath_type* out, const scalar_t* data, int64_t K) { + using opmath_t = at::opmath_type; + using Vec = vec::Vectorized; + map_acc( + [](Vec x, Vec y) { return update(x, y); }, + out, + out, + data, + K); +} + +template +inline void write(scalar_t* out, int64_t count, int64_t K) { + using Vec = vec::Vectorized>; + if (reduce == ReductionType::MEAN) { + if (count > 0) { + vec::map( + [count](Vec x) { return x / Vec(count); }, + out, + out, + K); + } + } +} + +} // namespace CPU_CAPABILITY +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f34904a05d3df09a2cf7360cf047b8f775f792c1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native { +#if !defined(C10_MOBILE) +using fp16_gemv_fn = void(*)(int, int, float, const Half*, int, const Half*, int, float, Half*, int); +DECLARE_DISPATCH(fp16_gemv_fn, fp16_gemv_trans_stub) + +using bf16_gemv_fn = void(*)(int, int, BFloat16, const BFloat16*, int, const BFloat16*, int, BFloat16, BFloat16*, int); +DECLARE_DISPATCH(bf16_gemv_fn, bf16_gemv_trans_stub) + +using fp16_dot_fn = float(*)(const int64_t, const Half*, const int64_t, const Half*, const int64_t); +DECLARE_DISPATCH(fp16_dot_fn, fp16_dot_stub) + +using bf16_dot_fn = float(*)(const int64_t, const BFloat16*, const int64_t, const BFloat16*, const int64_t); +DECLARE_DISPATCH(bf16_dot_fn, bf16_dot_stub) + +inline namespace CPU_CAPABILITY { +float fp16_dot_with_fp32_arith(const Half* vec1, const Half* vec2, int64_t len); +float bf16_dot_with_fp32_arith(const BFloat16* vec1, const BFloat16* vec2, int64_t len); +} // inline namespace CPU_CAPABILITY +#endif // !defined(C10_MOBILE) +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..964f5aad7a1c4633ee8211ab809e654ec4127108 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +using sampled_addmm_sparse_csr_fn = void(*)(const Tensor&, const Tensor&, const Scalar&, const Scalar&, const Tensor&); + +DECLARE_DISPATCH(sampled_addmm_sparse_csr_fn, sampled_addmm_sparse_csr_stub) + +} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..0f401b131084661922ced32a901a33e0022d6005 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h @@ -0,0 +1,151 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright 2004-present Facebook. All Rights Reserved. +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace at::native::detail { + +struct InputMeta { + void* data_ptr; + int64_t inner_size; + + InputMeta(const Tensor& t, int64_t dim, int64_t inner) + : data_ptr(t.data_ptr()), inner_size(t.sizes()[dim] * inner) {} +}; + +// This kernel is used by two TensorList types: +// 1. stack_serial_kernel uses at::ArrayRef +// 2. Static runtime calls this kernel directly (csrc/jit/runtime/static/ops.cpp) with +// ProcessedNodeInputWrapper. +// When making changes, make sure that they are compatible with both types! +template +void stack_serial_kernel_impl(Tensor& result, TensorListType tensors, int64_t dim) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + dim >= 0 && dim <= result.dim(), + "dim out of range in stack_serial_kernel_impl"); + int64_t outer = + result.numel() / (result.sizes()[dim] * result.strides()[dim]); + scalar_t* result_data = result.data_ptr(); + int64_t ninputs = tensors.size(); + std::vector inputs; + inputs.reserve(ninputs); + for (const auto& tensor : tensors) { + inputs.emplace_back(tensor, dim, tensor.strides()[dim]); + } + + using Vec = vec::Vectorized; + scalar_t* result_ptr = result_data; + for (const auto i : c10::irange(outer)) { + for (const auto j : c10::irange(ninputs)) { + int64_t local_inner = inputs[j].inner_size; + scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner; + + if (local_inner < Vec::size()) { + for (const auto k : c10::irange(local_inner)) { + result_ptr[k] = input_ptr[k]; + } + } else { + vec::map( + [](Vec x) { return x; }, result_ptr, input_ptr, local_inner); + } + result_ptr += local_inner; + } + } +} + +// Checks to see whether native stack can be invoked under these conditions: +// - result and input tensors are contiguous +// - only one thread is used +// - no type promotion has to occur +// - tensors dtype is Double or Float +template +bool can_use_native_serial_stack_impl(Tensor& result, TensorListType tensors, int64_t dim) { + TORCH_CHECK(!tensors.empty(), "expected a non-empty list of Tensors"); + const Tensor& first_tensor = tensors[0]; + // stack dimension should be in range [0,firstTensor.dim()) + // dim == firstTensor.dim() is a valid input, but it is handled by default code path + // that uses unsqueeze + if (dim >= first_tensor.dim()) return false; + // Native stack doesn't apply any tensor is skipped. + if (first_tensor.numel() == 0 && first_tensor.dim() == 1) return false; + // there should be no type promotion + if (result.dtype() != first_tensor.dtype()) return false; + + auto first_tensor_mem_format = first_tensor.suggest_memory_format(); + ScalarType dtype = first_tensor.scalar_type(); + + if (!result.is_contiguous(first_tensor_mem_format)) { + return false; + } + + // fast path only works for Double and Float + if (dtype != ScalarType::Double && dtype != ScalarType::Float) { + return false; + } + + // check remainder of inputs +#ifndef STRIP_ERROR_MESSAGES + auto const &first_tensor_shape = first_tensor.sizes(); +#endif + for (const auto i : c10::irange(1, tensors.size())) { + auto const &tensor = tensors[i]; + TORCH_CHECK(tensors[i].sizes() == first_tensor.sizes(), + "stack expects each tensor to be equal size, but got ", first_tensor_shape, + " at entry 0 and ", tensor.sizes(), " at entry ", i); + + // every tensor must be contiguous + // tensor sizes and strides must be the same + // there should be no type promotion + if (!tensor.is_contiguous(first_tensor_mem_format) || + tensor.strides() != first_tensor.strides() || + tensor.dtype() != dtype) { + return false; + } + } + + // fast native stack should only be used when it is not worth using multiple threads + // or there is only one thread. Note that we aren't checking result.numel() here because + // it may not have been resized and we want to defer that cost till later. + int64_t numel_in_stack = first_tensor.numel() * tensors.size(); + return numel_in_stack < at::internal::GRAIN_SIZE || at::get_num_threads() == 1; +} + +template +struct CanUseNativeSerialStack; + +template +struct CanUseNativeSerialStack { + static bool call(Tensor& result, TensorListType tensors, int64_t dim) { + // Inputs cannot alias the output tensor + for (const auto i : c10::irange(tensors.size())) { + auto lap = at::get_overlap_status(result, tensors[i]); + TORCH_CHECK(lap != at::MemOverlapStatus::Partial && + lap != at::MemOverlapStatus::Full, 0, + "unsupported operation: the input tensors cannot refer to any of the " + "output memory locations. Found overlap in input tensor ", i); + } + + return can_use_native_serial_stack_impl(result, tensors, dim); + } +}; + +template +struct CanUseNativeSerialStack { + static bool call(Tensor& result, TensorListType tensors, int64_t dim) { + return can_use_native_serial_stack_impl(result, tensors, dim); + } +}; + +} // namespace at::native::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..35d9d0a33c49adfbfd7b77a957552594c8b51c0f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using forward_fn = void (*)(const Tensor&, const Tensor&); +using backward_fn = void(*)(const Tensor &, const Tensor &, const Tensor&); + +DECLARE_DISPATCH(forward_fn, softmax_lastdim_kernel) +DECLARE_DISPATCH(forward_fn, log_softmax_lastdim_kernel) +DECLARE_DISPATCH(backward_fn, softmax_backward_lastdim_kernel) +DECLARE_DISPATCH(backward_fn, log_softmax_backward_lastdim_kernel) + +using forward_fn_with_dim = void(*)(const Tensor &, const Tensor &, const int64_t); +using backward_fn_with_dim = + void (*)(const Tensor&, const Tensor&, const Tensor&, const int64_t); + +DECLARE_DISPATCH(forward_fn_with_dim, softmax_kernel) +DECLARE_DISPATCH(forward_fn_with_dim, log_softmax_kernel) +DECLARE_DISPATCH(backward_fn_with_dim, softmax_backward_kernel) +DECLARE_DISPATCH(backward_fn_with_dim, log_softmax_backward_kernel) +} +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8cca5fdca9252e05f56751278a309a68760c1bb6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native { + +using spmm_reduce_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op); +using spmm_reduce_arg_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op); +using spmm_reduce_backward_input_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op); +using spmm_reduce_backward_input_arg_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op); +using spmm_reduce_backward_other_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op); + +DECLARE_DISPATCH(spmm_reduce_fn, spmm_reduce_stub) +DECLARE_DISPATCH(spmm_reduce_arg_fn, spmm_reduce_arg_stub) +DECLARE_DISPATCH(spmm_reduce_backward_input_fn, spmm_reduce_backward_input_stub) +DECLARE_DISPATCH(spmm_reduce_backward_input_arg_fn, spmm_reduce_backward_input_arg_stub) +DECLARE_DISPATCH(spmm_reduce_backward_other_fn, spmm_reduce_backward_other_stub) +DECLARE_DISPATCH(spmm_reduce_backward_input_arg_fn, spmm_reduce_backward_other_arg_stub) + +} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/StackKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/StackKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f30015d64c7f299b65160453ea684d0b45107ce4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/StackKernel.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright 2004-present Facebook. All Rights Reserved. +#pragma once + +#include +#include + +namespace at::native { + +using stack_serial_fn = void(*)(Tensor &, TensorList, int64_t); +DECLARE_DISPATCH(stack_serial_fn, stack_serial_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h new file mode 100644 index 0000000000000000000000000000000000000000..5e30c19287a88e8e335b70798996946a6dd166ac --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h @@ -0,0 +1,1381 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* +The Python Imaging Library (PIL) is + + Copyright © 1997-2011 by Secret Labs AB + Copyright © 1995-2011 by Fredrik Lundh + +Pillow is the friendly PIL fork. It is + + Copyright © 2010-2022 by Alex Clark and contributors + +Like PIL, Pillow is licensed under the open source HPND License +*/ + +// This code is heavily inspired from PILLOW-SIMD's implementation: +// https://github.com/uploadcare/pillow-simd/blob/simd/master/src/libImaging/Resample.c + +#pragma once +#ifdef CPU_CAPABILITY_AVX2 +// TODO: This file only supports AVX2. We could split the AVX kernels into +// smaller logical blocks in order to port them into the Vec.h logic. This would +// allow to support other vectorization architectures and perhaps also support +// the non-vectorized fallback (we'd need to make sure it's not slower than the +// current fallback). + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + + +namespace { + +inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { + int32_t v; + if (i32_aligned) { + v = *(const int32_t*)ptr; + } else { + std::memcpy(&v, ptr, 4); + } + return _mm_cvtsi32_si128(v); +} + +inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { + return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned)); +} + +inline void _write_endline_rgb_as_uint32( + uint8_t* C10_RESTRICT output, + uint32_t data +) { + // data is (R G B X), output is (X1 X2 X3 | R1 B1 G1 R2 ...) + // Here we explicitly set X as R1 + uint8_t* data_ptr = reinterpret_cast(&data); + data_ptr[3] = output[3]; + std::memcpy(output, data_ptr, 4); +} + +at::Tensor unpack_rgb(const at::Tensor& packed_tensor) { + // Convert a "packed" tensor (typically RGBRGBRGB if channels_last) into + // RGBARGBARGBA format where A is hard-coded to 0. Each pixel is encoded + // into as 32 bits. This generalizes to num_channels <= 4 and also works for + // non-channels_last tensors. + + const uint8_t* packed = (const uint8_t*)packed_tensor.const_data_ptr(); + auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2); + auto num_channels = packed_tensor.size(0); + + constexpr int rgba_size = 4; + auto unpacked_tensor = at::empty({rgba_size, packed_tensor.size(1), packed_tensor.size(2)}, at::CPU(at::kByte)); + uint8_t* unpacked = (uint8_t*) unpacked_tensor.data_ptr(); + + auto stride_i = packed_tensor.stride(2); + auto stride_j = packed_tensor.stride(0); + + for (const auto i : c10::irange(num_pixels)) { + for (const auto j : c10::irange(rgba_size)) { + unpacked[rgba_size * i + j] = (j < num_channels) ? packed[stride_i * i + stride_j * j] : 0; + } + } + return unpacked_tensor; +} + +void pack_rgb( + const at::Tensor& unpacked_tensor, // IN + const at::Tensor& packed_tensor // OUT +) { + // Convert from unpacked channels last 3-channels or 4-channels tensor into original data layout. + + uint8_t* unpacked = (uint8_t*)unpacked_tensor.data_ptr(); + uint8_t* packed = (uint8_t*)packed_tensor.data_ptr(); + auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2); + auto num_channels = packed_tensor.size(0); + + auto unpacked_increment = unpacked_tensor.size(0); + auto packed_increment = packed_tensor.stride(2); + auto packed_stride = packed_tensor.stride(0); + + TORCH_INTERNAL_ASSERT(unpacked_increment == 3 || unpacked_increment == 4); + + for ([[maybe_unused]] const auto i : c10::irange(num_pixels)) { + for (const auto j : c10::irange(num_channels)) { + packed[j * packed_stride] = unpacked[j]; + } + unpacked += unpacked_increment; + packed += packed_increment; + } +} + +void ImagingResampleHorizontalConvolution8u4x( + uint8_t* C10_RESTRICT lineOut0, + uint8_t* C10_RESTRICT lineOut1, + uint8_t* C10_RESTRICT lineOut2, + uint8_t* C10_RESTRICT lineOut3, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn0, + const uint8_t* C10_RESTRICT lineIn1, + const uint8_t* C10_RESTRICT lineIn2, + const uint8_t* C10_RESTRICT lineIn3, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line); + +void ImagingResampleHorizontalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line); + +void ImagingResampleVerticalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + const uint8_t* C10_RESTRICT lineIn, + int64_t xsize, + int64_t ids_min, + int64_t ids_size, + const int16_t* k, + unsigned int coefs_precision, + int64_t num_channels); + +template +void ImagingResampleHorizontal( + const at::Tensor & unpacked_output, + const at::Tensor & unpacked_input, + int ksize, + const std::vector& horiz_indices_weights, + unsigned int horiz_weights_precision) { + + // Interpolation horizontal pass: we compute x-axis (image width) interpolation outputs. + + // Input data is stored as + // input = [r[0], g[0], b[0], a[0], r[1], g[1], b[1], a[1], r[2], g[2], b[2], a[2], ...] + // Weights are float values computed for each output pixel and rescaled to uint16: + // weights[i] = [w[i, 0], w[i, 1], ..., w[i, K-1]] + // We want to compute the output as following: + // output = [oR[0], oG[0], oB[0], oA[0], oR[1], oG[1], oB[1], oA[1], ...] + // where + // oR[yoffset + i] = r[yoffset + xmin[i]] * w[i, 0] + ... + r[yoffset + xmin[i] + K-1] * w[i, K-1] + // oG[yoffset + i] = g[yoffset + xmin[i]] * w[i, 0] + ... + g[yoffset + xmin[i] + K-1] * w[i, K-1] + // oB[yoffset + i] = b[yoffset + xmin[i]] * w[i, 0] + ... + b[yoffset + xmin[i] + K-1] * w[i, K-1] + // + + // TODO: we may want to merge that into the fallback code (currently called + // basic_loop_aa_horizontal) + // Although this may not be needed if / when we port all this code to use + // Vec.h since this would potentially give us another fall-back implem + + const int16_t* kk = (int16_t*)(horiz_indices_weights[3].const_data_ptr()); + + auto xout = unpacked_output.size(2); + auto yout = unpacked_output.size(1); + auto xin = unpacked_input.size(2); + TORCH_INTERNAL_ASSERT(num_channels == unpacked_input.size(0)); + + const int64_t* idx_ptr_xmin = horiz_indices_weights[0].const_data_ptr(); + const int64_t* idx_ptr_size = horiz_indices_weights[1].const_data_ptr(); + + uint8_t* unpacked_output_p = unpacked_output.data_ptr(); + const uint8_t* unpacked_input_p = unpacked_input.const_data_ptr(); + + int64_t yy = 0; + auto xout_stride = xout * num_channels; + auto xin_stride = xin * num_channels; + for (; yy < yout - 3; yy += 4) { + ImagingResampleHorizontalConvolution8u4x( + unpacked_output_p + yy * xout_stride, + unpacked_output_p + (yy + 1) * xout_stride, + unpacked_output_p + (yy + 2) * xout_stride, + unpacked_output_p + (yy + 3) * xout_stride, + xout, + unpacked_input_p + yy * xin_stride, + unpacked_input_p + (yy + 1) * xin_stride, + unpacked_input_p + (yy + 2) * xin_stride, + unpacked_input_p + (yy + 3) * xin_stride, + xin, + idx_ptr_xmin, + idx_ptr_size, + kk, + ksize, + horiz_weights_precision, + num_channels, + yy + 3 == yout - 1); + } + for (; yy < yout; yy++) { + ImagingResampleHorizontalConvolution8u( + unpacked_output_p + yy * xout_stride, + xout, + unpacked_input_p + yy * xin_stride, + xin, + idx_ptr_xmin, + idx_ptr_size, + kk, + ksize, + horiz_weights_precision, + num_channels, + yy == yout - 1); + } +} + +void ImagingResampleVertical( + const at::Tensor & unpacked_output, + const at::Tensor & unpacked_input, + int ksize, + const std::vector& vert_indices_weights, + unsigned int vert_weights_precision) { + + // Interpolation vertical pass: we compute y-axis interpolation outputs. + // Input data is stored as + // input = [r[0], g[0], b[0], a[0], r[1], g[1], b[1], a[1], r[2], g[2], b[2], a[2], ...] + // Weights are float values computed for each output pixel and rescaled to uint16: + // weights[i] = [w[i, 0], w[i, 1], ..., w[i, K-1]] + // We want to compute the output as following: + // output = [oR[0], oG[0], oB[0], oA[0], oR[1], oG[1], oB[1], oA[1], ...] + // where + // oR[xoffset + i] = r[xoffset + ymin[i]] * w[i, 0] + ... + r[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1] + // oG[xoffset + i] = g[xoffset + ymin[i]] * w[i, 0] + ... + g[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1] + // oB[xoffset + i] = b[xoffset + ymin[i]] * w[i, 0] + ... + b[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1] + + // TODO: we may want to merge that into the fallback code (currently called + // basic_loop_aa_vertical) + // Although this may not be needed if / when we port all this code to use + // Vec.h since this would potentially give us another fall-back implem + const int16_t* kk = (int16_t*)(vert_indices_weights[3].const_data_ptr()); + + const int64_t* idx_ptr_xmin = vert_indices_weights[0].const_data_ptr(); + const int64_t* idx_ptr_size = vert_indices_weights[1].const_data_ptr(); + + uint8_t* unpacked_output_p = unpacked_output.data_ptr(); + const uint8_t* unpacked_input_p = unpacked_input.const_data_ptr(); + + auto xout = unpacked_output.size(2); + auto yout = unpacked_output.size(1); + const auto num_channels = unpacked_input.size(0); + TORCH_INTERNAL_ASSERT(num_channels == unpacked_output.size(0)); + + auto xout_stride = xout * num_channels; + for (const auto yy : c10::irange(yout)) { + const auto* k = &kk[yy * ksize]; + auto ids_min = idx_ptr_xmin[yy]; + auto ids_size = idx_ptr_size[yy]; + ImagingResampleVerticalConvolution8u( + unpacked_output_p + yy * xout_stride, + unpacked_input_p, + xout, + ids_min, + ids_size, + k, + vert_weights_precision, + num_channels); + } +} + +// This is the only public entry point in this file. It supports bilinear or bicubic +// mode for uint8 dtype when C <= 4, with or without antialias. The +// implem is based on PIL-SIMD. +// Its equivalent implementation (fallback) for when AVX isn't supported or when +// C > 4 is separable_upsample_generic_Nd_kernel_impl() There are a bunch of +// future improvement that can be done: look for the TODOs in this file. +// For details on how the weights are computed and how the multiplications are +// run on int (instead of float weights), see +// [ Weights computation for uint8_t and multiplication trick ] +// For details on how the AVX kernels are implemented, see +// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5 +// See also [ Support for antialias=False as a subcase of antialias=True ] to +// learn more about how the antialias=False case is computed. The same holds +// here: all these kernels are general enough to handle an arbitrary number of +// weights, but when aa=False they could be optimized further. +template +void upsample_avx_bilinear_bicubic_uint8( + const at::Tensor& input_, + const at::Tensor& output, + bool align_corners, + const scale_type& scales, + bool antialias) { + auto batch_size = input_.size(0); + auto num_channels = input_.size(1); + auto xin = input_.size(3); + auto yin = input_.size(2); + auto xout = output.size(3); + auto yout = output.size(2); + + if (xin == xout && yin == yout) { + output.copy_(input_); + return; + } + + at::Tensor input = input_; + if (!(input.is_contiguous() || input.is_contiguous(at::MemoryFormat::ChannelsLast))) { + // If input is not contiguous with memory format channels first or channels last, + // we explicitly convert the input to contiguous channels last memory format. + // This simplifies the rest of the code and let us assume that the format is only contiguous channels first or channels last, + // Most tensors going through this `if` block won't need to go through unpacking, but those having C < 3 may + // have to (this means 2 copies are made). We could avoid the extra copy by handling non-contiguous input + // directly within unpack_rgb() and pack_rgb(), but initial attempts showed that this is fairly complex. + input = input.contiguous(at::MemoryFormat::ChannelsLast); + } + + auto need_horizontal = xout != xin; + auto need_vertical = yout != yin; + + int ksize_horiz, ksize_vert; + std::vector horiz_indices_weights, vert_indices_weights; + unsigned int horiz_weights_precision, vert_weights_precision; + + bool skip_unpacking = (num_channels == 3 || num_channels == 4) && input.is_contiguous(at::MemoryFormat::ChannelsLast); + bool skip_packing = (num_channels == 3 || num_channels == 4) && output.is_contiguous(at::MemoryFormat::ChannelsLast); + + if (need_horizontal) { + int interp_dim = 3; + auto stride = skip_unpacking ? num_channels : 4; + std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) = + F::compute_index_ranges_int16_weights( + /*input_size=*/xin, + /*output_size=*/xout, + /*stride=*/stride, + /*ndims=*/4, + /*reshape_dim=*/interp_dim, + /*align_corners=*/align_corners, + /*opt_scale=*/scales[interp_dim - 2], + /*antialias=*/antialias, + /*align_i32=*/true); + } + + if (need_vertical) { + int interp_dim = 2; + auto stride = skip_unpacking ? num_channels * xout : 4 * xout; + std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) = + F::compute_index_ranges_int16_weights( + /*input_size=*/yin, + /*output_size=*/yout, + /*stride=*/stride, + /*ndims=*/4, + /*reshape_dim=*/interp_dim, + /*align_corners=*/align_corners, + /*opt_scale=*/scales[interp_dim - 2], + /*antialias=*/antialias, + /*align_i32=*/true); + } + + at::Tensor buffer_horiz, buffer_vert; + // Minor optimization: we can avoid allocating an extra buffer if we're performing + // horizontal-only or vertical-only interpolation, and if the tensor doesn't + // need repacking + if (need_horizontal && (need_vertical || !skip_packing)) { + auto c = skip_unpacking ? num_channels : 4; + buffer_horiz = at::empty({c, yin, xout}, input.options()); + } + if (need_vertical && !skip_packing) { + auto c = skip_unpacking ? num_channels : 4; + buffer_vert = at::empty({c, yout, xout}, input.options()); + } + + for (const auto i : c10::irange(batch_size)) { + + at::Tensor unpacked_input = skip_unpacking ? input[i] : unpack_rgb(input[i]); + at::Tensor unpacked_output; + + if (need_horizontal) { + at::Tensor unpacked_output_temp = (need_vertical || !skip_packing) ? buffer_horiz : output[i]; + + if (skip_unpacking && num_channels == 3) { + ImagingResampleHorizontal<3>( + unpacked_output_temp, + unpacked_input, + ksize_horiz, + horiz_indices_weights, + horiz_weights_precision); + } else { + ImagingResampleHorizontal<4>( + unpacked_output_temp, + unpacked_input, + ksize_horiz, + horiz_indices_weights, + horiz_weights_precision); + } + unpacked_output = unpacked_input = unpacked_output_temp; + } + if (need_vertical) { + unpacked_output = skip_packing ? output[i] : buffer_vert; + + ImagingResampleVertical( + unpacked_output, + unpacked_input, + ksize_vert, + vert_indices_weights, + vert_weights_precision + ); + } + + TORCH_INTERNAL_ASSERT(unpacked_output.defined()); + + if (!skip_packing) { + pack_rgb(unpacked_output, output[i]); + } + } +} + +void ImagingResampleHorizontalConvolution8u4x( + uint8_t* C10_RESTRICT lineOut0, + uint8_t* C10_RESTRICT lineOut1, + uint8_t* C10_RESTRICT lineOut2, + uint8_t* C10_RESTRICT lineOut3, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn0, + const uint8_t* C10_RESTRICT lineIn1, + const uint8_t* C10_RESTRICT lineIn2, + const uint8_t* C10_RESTRICT lineIn3, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line) { + + // Interpolation horizontal pass processing together 4 vertical lines. + // - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA + // we can encode 4 values as a single uint32 value. + // - We split the size of weight vector for a given output index as a sum: + // ids_size = num_blocks_4 * 4 + num_blocks_2 * 2 + num_blocks_1. + // - We load and process 4 weights values in a loop ("block 4") then we process 2 weights values + // in another loop ("block 2") and finally we process 1 weights value in the final loop ("block 1"). + + // Define shuffling masks (low/high) for num_channels 4 and 3 + // Mask low casts lower half of each lane to epi16 and reorder RGBARGBA -> RRGGBBAA: + // [r1 g1 b1 a1 r2 g2 b2 a2 ... | R1 G1 B1 A1 R2 G2 B2 A2 ... ] -> + // [r1 0 r2 0 g1 0 g2 0 b1 0 b2 0 a1 0 a2 0 | R1 0 R2 0 G1 0 G2 0 B1 0 B2 0 A1 0 A2 0] + // Mask high casts upper half of each lane to epi16 and reorder RGBARGBA -> RRGGBBAA:: + // [ ... r3 g3 b3 a3 r4 g4 b4 a4 | ... R3 G3 B3 A3 R4 G4 B4 A4 ] -> + // [r3 0 r4 0 g3 0 g4 0 b3 0 b4 0 a3 0 a4 0 | R3 0 R4 0 G3 0 G4 0 B3 0 B4 0 A3 0 A4 0] + + const auto mask_low_c4 = _mm256_set_epi8( + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0, + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + const auto mask_high_c4 = _mm256_set_epi8( + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8, + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8); + const auto mask_low_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0, + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto mask_high_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6, + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6); + + const auto mask_low = (num_channels == 3) ? mask_low_c3 : mask_low_c4; + const auto mask_high = (num_channels == 3) ? mask_high_c3 : mask_high_c4; + + const auto stride = num_channels * sizeof(uint8_t); + + TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4); + + // out_xsize = output width, out_x = output x index + // ids_min is the input offset index corresponding to out_x + // ids_size is the interpolation size for out_x + + // Let's precompute ids_size limits for block 4 and block 2. + // + // In block 4 (4 means we process 4 weight values together), we read input data + // with _mm_loadu_si128, i.e. 16 bytes, per one line: + // lineIn0 + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 16.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft + // RGBA: b4_delta = b4_delta_soft = 3 + // RGB : b4_delta = 5 + // RGB : b4_delta_soft = 4 + const auto b4_delta = (stride == 4) ? 3 : (is_last_line ? 5 : 4); + + // In block 2 (2 means we process 2 weights values together), we read input data + // with _mm_loadl_epi64, i.e. 8 bytes, per one line: + // lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 8.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft + // RGBA: b2_delta = b2_delta_soft = 1 + // RGB : b2_delta = 2 + // RGB : b2_delta_soft = 1 + const auto b2_delta = (stride == 4) ? 1 : (is_last_line ? 2 : 1); + + const auto max_out_x_strided = out_xsize * stride; + const auto max_in_x_strided = in_xsize * stride; + + const auto zero = _mm256_setzero_si256(); + const auto initial = _mm256_set1_epi32(1 << (coefs_precision - 1)); + + for (const auto out_x : c10::irange(out_xsize)) { + const auto ids_min = idx_ptr_xmin[out_x]; + const auto ids_size = idx_ptr_size[out_x]; + const auto * k = &kk[out_x * kmax]; + int64_t i = 0; + + auto sss0 = initial; + auto sss1 = initial; + + const auto * lineIn0_min = lineIn0 + ids_min; + const auto * lineIn1_min = lineIn1 + ids_min; + const auto * lineIn2_min = lineIn2 + ids_min; + const auto * lineIn3_min = lineIn3 + ids_min; + + // block 4 + for (; i < ids_size - b4_delta; i += 4) { + // Load 4 values from weight vector + // mmk0 = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ...] + // mmk1 = [wl_2 wh_2 wl_3 wh_3 wl_2 wh_2 wl_3 wh_3 ...] + const auto mmk0 = _mm256_set1_epi32(*(int32_t*)&k[i]); + const auto mmk1 = _mm256_set1_epi32(*(int32_t*)&k[i + 2]); + + // RGBA: Load 8 pixels (4 per line) from input lines 0 and 1: + // source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // R0 G0 B0 A0 R1 G1 B1 A1 R2 G2 B2 A2 R3 G3 B3 A3 + // ] + // RGB: Load 10 pixels (5 per line) + // source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + // ] + auto source = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadu_si128((__m128i *) (lineIn0_min + stride * i))), + _mm_loadu_si128((__m128i *) (lineIn1_min + stride * i)), 1); + + // Apply mask_low: + // RGBA: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 A0 0 A1 0] + // RGB: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 0 0 0 0] + auto pix1 = _mm256_shuffle_epi8(source, mask_low); + // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk0)); + + // Apply mask_high: + // RGBA: + // [r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 a2 0 a3 0 | R2 0 R3 0 G2 0 G3 0 B2 0 B3 0 A2 0 A3 0] + // RGB: + // [r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 0 0 0 0 | R2 0 R3 0 G2 0 G3 0 B2 0 B3 0 0 0 0 0] + auto pix2 = _mm256_shuffle_epi8(source, mask_high); + // Compute output value as C += w2 * C2 + w3 * C3 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix2, mmk1)); + + // Same as above to next lines 2 and 3: + auto source2 = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadu_si128((__m128i *) (lineIn2_min + stride * i))), + _mm_loadu_si128((__m128i *) (lineIn3_min + stride * i)), 1); + auto pix3 = _mm256_shuffle_epi8(source2, mask_low); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix3, mmk0)); + auto pix4 = _mm256_shuffle_epi8(source2, mask_high); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix4, mmk1)); + } + + // block 2 + for (; i < ids_size - b2_delta; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ...] + const auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]); + + // Load 4 pixels (2 per line) from input lines 0 and 1: + // RGBA: source1 = [ + // r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0 + // R0 G0 B0 A0 R1 G1 B1 A1 0 0 0 0 0 0 0 0 + // ] + // RGB: source1 = [ + // r0 g0 b0 r1 g1 b1 r2 0 0 0 0 0 0 0 0 + // R0 G0 B0 R1 G1 B1 R2 0 0 0 0 0 0 0 0 + // ] + auto source1 = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *) (lineIn0_min + stride * i))), + _mm_loadl_epi64((__m128i *) (lineIn1_min + stride * i)), 1); + // Apply mask_low: + // RGBA: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 A0 0 A1 0] + // RGB: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 0 0 0 0] + auto pix1 = _mm256_shuffle_epi8(source1, mask_low); + // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + + // Same as above for lines 2 and 3: + auto source2 = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *) (lineIn2_min + stride * i))), + _mm_loadl_epi64((__m128i *) (lineIn3_min + stride * i)), 1); + auto pix2 = _mm256_shuffle_epi8(source2, mask_low); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + } + + // block 1 + const auto i32_aligned = num_channels == 4; + for (; i < ids_size - 1; i++) { + // Load 1 value from weight vector + // mmk = [wl_0 wh_0 0 0 wl_0 wh_0 0 0 ...] + const auto mmk = _mm256_set1_epi32(k[i]); + + // Load 2 pixels (one per line) from input lines 0 and 1: + // RGBA: pix1 = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 a0 0 0 0 + // R0 0 0 0 G0 0 0 0 B0 0 0 0 A0 0 0 0 + // ] + // RGB: pix1 = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 r1 0 0 0 + // R0 0 0 0 G0 0 0 0 B0 0 0 0 R1 0 0 0 + // ] + auto pix1 = _mm256_inserti128_si256(_mm256_castsi128_si256( + mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)), + mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1); + // Compute output value as C += w0 * C0 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + + // Same as above for lines 2 and 3 + auto pix2 = _mm256_inserti128_si256(_mm256_castsi128_si256( + mm_cvtepu8_epi32(lineIn2_min + stride * i, i32_aligned)), + mm_cvtepu8_epi32(lineIn3_min + stride * i, i32_aligned), 1); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + } + + if (i == ids_size - 1) { + // last element + auto mmk = _mm256_set1_epi32(k[i]); + // For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes + // lines 0, 1 and 2 won't go out of allocated memory bounds + auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256( + mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)), + mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); + + auto p0 = mm_cvtepu8_epi32(lineIn2_min + stride * i, i32_aligned); + __m128i p1; + if (num_channels == 3 && C10_UNLIKELY(is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) { + uint8_t input[4]; + std::memcpy(input, lineIn3_min + stride * i, 3); + p1 = mm_cvtepu8_epi32(input, true); + } else { + p1 = mm_cvtepu8_epi32(lineIn3_min + stride * i, i32_aligned); + } + auto pix2 = _mm256_inserti128_si256(_mm256_castsi128_si256(p0), p1, 1); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + } + + // Convert fixed point values back to integers (truncating) + sss0 = _mm256_srai_epi32(sss0, coefs_precision); + sss1 = _mm256_srai_epi32(sss1, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d 0 0 0 0 0 0 0 0) + sss0 = _mm256_packs_epi32(sss0, zero); + sss1 = _mm256_packs_epi32(sss1, zero); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d 0 0 0 0) + sss0 = _mm256_packus_epi16(sss0, zero); + sss1 = _mm256_packus_epi16(sss1, zero); + + // Write the output into single uint32 + // (a b c d) -> x_uint32 + auto o0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sss0)); + auto o1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1)); + auto o2 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sss1)); + auto o3 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1)); + + const auto out_x_strided = stride * out_x; + + if (num_channels == 3 && C10_UNLIKELY(out_x_strided + 4 >= max_out_x_strided)) { + // Memcpy 4-bytes is faster than 3-bytes and this is a boundary case when we want to write + // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1). + // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct + // value which was previously computed by another line. In other words, it means that we can not overwrite + // it by simply writing 4 bytes from the register to the output. We'll do the following: + // v----------| + // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...] + // First, we write R1 value to the 4th byte of (R G B | X) -> (R G B | R1) + // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | R1) + // Output = [... R G B | R1 G1 B1 R2 ...] + + _write_endline_rgb_as_uint32(lineOut0 + out_x_strided, o0); + _write_endline_rgb_as_uint32(lineOut1 + out_x_strided, o1); + _write_endline_rgb_as_uint32(lineOut2 + out_x_strided, o2); + + if (C10_UNLIKELY(is_last_line)) { + // When we handle the last line, we can not access the next 4 bytes + // as they are out of memory bounds. + std::memcpy(lineOut3 + out_x_strided, (uint8_t *) &o3, num_channels); + } else { + _write_endline_rgb_as_uint32(lineOut3 + out_x_strided, o3); + } + } else if (num_channels == 3) { + // Memcpy 4-bytes is faster than 3-bytes and here + // we simply write 4 bytes (... R G B X 0 0 0 0 0 ...) where X is a garbage value + // that we will overwrite on the next iteration: (... R G B R G B X 0 0 ...) + std::memcpy(lineOut0 + out_x_strided, (uint8_t *) &o0, 4); + std::memcpy(lineOut1 + out_x_strided, (uint8_t *) &o1, 4); + std::memcpy(lineOut2 + out_x_strided, (uint8_t *) &o2, 4); + std::memcpy(lineOut3 + out_x_strided, (uint8_t *) &o3, 4); + } else { + // num_channels = 4 -> lineOutX + out_x_strided should be uint32 aligned + *(uint32_t *)(lineOut0 + out_x_strided) = o0; + *(uint32_t *)(lineOut1 + out_x_strided) = o1; + *(uint32_t *)(lineOut2 + out_x_strided) = o2; + *(uint32_t *)(lineOut3 + out_x_strided) = o3; + } + } +} + +void ImagingResampleHorizontalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line) { + + // Interpolation horizontal pass processing only one vertical line. + // - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA + // we can encode 4 values as a single uint32 value. + // - We split the size of weight vector for a given output index as a sum: + // ids_size = num_blocks_8 * 8 + num_blocks_4 * 4 + num_blocks_2 * 2 + num_blocks_1 + // - We load and process 8 weights values in a loop ("block 8") then 4 weights and 2 weights values in + // in another loops ("block 4" and "block 2") and finally we process 1 weight value in the final loop ("block 1"). + + // Define various shuffling masks + const auto kmask_low = _mm256_set_epi8( + 11, 10, 9, 8, 11, 10, 9, 8, 11, 10, 9, 8, 11, 10, 9, 8, + 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + const auto kmask_high = _mm256_set_epi8( + 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, + 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4); + const auto kmask_hl = _mm256_set_epi8( + 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, + 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + + const auto mask_low_c4 = _mm256_set_epi8( + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0, + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + const auto mask_high_c4 = _mm256_set_epi8( + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8, + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8); + const auto mask_low_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0, + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto mask_high_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6, + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6); + const auto mask_hl_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6, + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto mask_hl_c4 = _mm256_set_epi8( + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8, + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + + const auto mask_low128_c3 = _mm_set_epi8( + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto mask_low128_c4 = _mm_set_epi8( + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + + const auto mask_low = (num_channels == 3) ? mask_low_c3 : mask_low_c4; + const auto mask_high = (num_channels == 3) ? mask_high_c3 : mask_high_c4; + const auto mask_hl = (num_channels == 3) ? mask_hl_c3 : mask_hl_c4; + const auto mask_low128 = (num_channels == 3) ? mask_low128_c3 : mask_low128_c4; + + // out_xsize = output width, out_x = output x index + // ids_min is the input offset index corresponding to out_x + // ids_size is the interpolation size for out_x + + const auto stride = num_channels * sizeof(uint8_t); + const auto zero = _mm_setzero_si128(); + + TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4); + + // Let's precompute ids_size limits for block 8, block 4 and block 2 + // + // In block 8 (8 means we process 8 weight values together), we read at + // most 32 bytes input data (16 + 16 bytes for RGBA and 12 + 16 bytes for RGB) + // lineIn + stride * (i + ids_min) + 32 <= lineIn + stride * (ids_size + ids_min) + // --> i <= ids_size - 32.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(32.0 / stride)) = ids_size - b8_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(32.0 / stride) = ids_size - b8_delta_soft + // RGBA: b8_delta = b8_delta_soft = 7 + // RGB : b8_delta = 10 + // RGB : b8_delta_soft = 9 + const auto b8_delta = (stride == 4) ? 7 : (is_last_line ? 10 : 9); + + // In block 4 (4 means we process 4 weight values together), we read + // 16 bytes of input data. + // lineIn + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 16.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft + // RGBA: b4_delta = b4_delta_soft = 3 + // RGB : b4_delta = 5 + // RGB : b4_delta_soft = 4 + const auto b4_delta = (stride == 4) ? 3 : (is_last_line ? 5 : 4); + + // In block 2 (2 means we process 2 weight values together), we read + // 8 bytes of input data. + // lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 8.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft + // RGBA: b2_delta = b2_delta_soft = 1 + // RGB : b2_delta = 2 + // RGB : b2_delta_soft = 1 + const auto b2_delta = (stride == 4) ? 1 : (is_last_line ? 2 : 1); + + const auto max_out_x_strided = out_xsize * stride; + const auto max_in_x_strided = in_xsize * stride; + + for (const auto out_x : c10::irange(out_xsize)) { + __m128i sss; + const auto ids_min = idx_ptr_xmin[out_x]; + const auto ids_size = idx_ptr_size[out_x]; + const auto * k = &kk[out_x * kmax]; + int64_t i = 0; + + const auto * lineIn_min = lineIn + ids_min; + + if (ids_size < 8) { + sss = _mm_set1_epi32(1 << (coefs_precision - 1)); + } else { + // Lower part will be added to higher, use only half of the error + auto sss256 = _mm256_set1_epi32(1 << (coefs_precision - 2)); + + // block 8 + for (; i < ids_size - b8_delta; i += 8) { + // Load 8 values from weight vector + auto tmp = _mm_loadu_si128((__m128i*)&k[i]); + // ksource = [ + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 wl_4 wh_4 wl_5 wh_5 wl_6 wh_6 wl_7 wh_7 + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 wl_4 wh_4 wl_5 wh_5 wl_6 wh_6 wl_7 wh_7 + // ] + auto ksource = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // RGBA: Load 8 pixels from input: + // source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // r4 g4 b4 a4 r5 g5 b5 a5 r6 g6 b6 a6 r7 g7 b7 a7 + // ] + // RGB: Load 10 pixels from input (however we can process only 8 pixels): + // source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // r4 g4 b4 r5 g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 + // ] + auto source = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadu_si128((__m128i *) (lineIn_min + stride * i))), + _mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1); + + // Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA + // RGBA: pix1 = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 + // r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0 + // ] + // RGB: pix1 = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 + // r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 0 0 0 0 + // ] + auto pix1 = _mm256_shuffle_epi8(source, mask_low); + // mmk1 = [ + // wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ... ... + // wl_4 wh_4 wl_5 wh_5 wl_4 wh_4 wl_5 wh_5 ... ... + // ] + auto mmk1 = _mm256_shuffle_epi8(ksource, kmask_low); + // Compute output value as + // C += w0 * C0 + w1 * C1 + // C += w4 * C4 + w5 * C5 for each channel in 32-bit precision + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix1, mmk1)); + + // Same as above for higher part of each lane + auto pix2 = _mm256_shuffle_epi8(source, mask_high); + auto mmk2 = _mm256_shuffle_epi8(ksource, kmask_high); + // Compute output value as + // C += w2 * C2 + w3 * C3 + // C += w6 * C6 + w7 * C7 for each channel in 32-bit precision + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix2, mmk2)); + } + + // block 4 + for (; i < ids_size - b4_delta; i += 4) { + // Load 4 values from weight vector + auto tmp = _mm_loadl_epi64((__m128i *) &k[i]); + // ksource = [ + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 0 0 0 0 0 0 0 0 + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 0 0 0 0 0 0 0 0 + // ] + auto ksource = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // Load pixels from input line + tmp = _mm_loadu_si128((__m128i *) (lineIn_min + stride * i)); + // RGBA: source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // ] + // RGB: source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // ] + auto source = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // Cast source to epi16 and reorder RGBARGBA -> RRGGBBAA + // RGBA: pix = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 + // r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 a2 0 a3 0 + // ] + // RGB: pix = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 + // r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 0 0 0 0 + // ] + auto pix = _mm256_shuffle_epi8(source, mask_hl); + // mmk = [ + // wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ... ... + // wl_2 wh_2 wl_3 wh_3 wl_2 wh_2 wl_3 wh_3 ... ... + // ] + auto mmk = _mm256_shuffle_epi8(ksource, kmask_hl); + // Compute output value as + // C += w0 * C0 + w1 * C1 + // C += w2 * C2 + w3 * C3 for each channel in 32-bit precision + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk)); + } + + // Sum results between the lanes + sss = _mm_add_epi32( + _mm256_extracti128_si256(sss256, 0), + _mm256_extracti128_si256(sss256, 1)); + } + + // block 2 + for (; i < ids_size - b2_delta; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ...] + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + // Load pixels from input line + // RGBA: source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0 + // ] + // RGB: source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 0 0 0 0 0 0 0 0 + // ] + auto source = _mm_loadl_epi64((__m128i *) (lineIn_min + stride * i)); + // Cast source to epi16 and reorder RGBARGBA -> RRGGBBAA + auto pix = _mm_shuffle_epi8(source, mask_low128); + // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // block 1 + const auto i32_aligned = num_channels == 4; + for (; i < ids_size - 1; i++) { + // Load 1 value from weight vector + // mmk = [wl_0 wh_0 0 0 wl_0 wh_0 0 0 ...] + auto mmk = _mm_set1_epi32(k[i]); + // Load one pixel from input line + // RGBA: pix = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 a0 0 0 0 + // ] + // RGB: pix = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 r1 0 0 0 + // ] + auto pix = mm_cvtepu8_epi32(lineIn_min + stride * i, i32_aligned); + // Compute output value as C += w0 * C0 for each channel in 32-bit precision + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + if (i == ids_size - 1) { + // last element + auto mmk = _mm_set1_epi32(k[i]); + __m128i pix; + auto p = lineIn_min + stride * i; + if (num_channels == 3 && C10_UNLIKELY(is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) { + uint8_t input[4]; + std::memcpy(input, p, 3); + pix = mm_cvtepu8_epi32(input, true); + } else { + pix = mm_cvtepu8_epi32(p, i32_aligned); + } + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // Convert fixed point values back to integers (truncating) + sss = _mm_srai_epi32(sss, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d 0 0 0 0 0 0 0 0) + sss = _mm_packs_epi32(sss, zero); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d 0 0 0 0) + sss = _mm_packus_epi16(sss, zero); + // Write the output into single uint32 + // (a b c d) -> x_uint32 + auto o = _mm_cvtsi128_si32(sss); + const auto out_x_strided = stride * out_x; + if (num_channels == 3 && C10_UNLIKELY(out_x_strided + 4 >= max_out_x_strided)) { + if (C10_UNLIKELY(is_last_line)) { + // When we handle the last line, we can not access the next 4 bytes + // as they are out of memory bounds. + std::memcpy(lineOut + out_x_strided, (uint8_t *) &o, 3); + } else { + // Memcpy 4-bytes is faster than 3-bytes and this is a boundary case when we want to write + // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1). + // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct + // value which was previously computed by another line. In other words, it means that we can not overwrite + // it by simply writing 4 bytes from the register to the output. We'll do the following: + // v----------| + // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...] + // First, we write R1 value to the 4th byte of (R G B | X) -> (R G B | R1) + // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | R1) + // Output = [... R G B | R1 G1 B1 R2 ...] + _write_endline_rgb_as_uint32(lineOut + out_x_strided, o); + } + } else if (num_channels == 3) { + // Memcpy 4-bytes is faster than 3-bytes and here + // we simply write 4 bytes (... R G B X 0 0 0 0 0 ...) where X is a garbage value + // that we will overwrite on the next iteration: (... R G B R G B X 0 0 ...) + std::memcpy(lineOut + out_x_strided, (uint8_t *) &o, 4); + } else { + // num_channels = 4 -> lineOut + out_x_strided should be uint32 aligned + *(uint32_t *)(lineOut + out_x_strided) = o; + } + } +} + +void ImagingResampleVerticalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + const uint8_t* C10_RESTRICT lineIn, + int64_t xsize, + int64_t ids_min, + int64_t ids_size, + const int16_t* k, + unsigned int coefs_precision, + int64_t num_channels) { + + // Interpolation vertical pass processing one line. + // - We process x-axis data with blocks of 8, 2 and 1 + // - We split the size of weight vector for a given output index as a sum: K = n * 2 + m. + + // xsize = output width, also equals to input width + // ids_size = interpolation size + // ids_min = input y start index + const auto stride = num_channels * sizeof(uint8_t); + + TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4); + + const int64_t data_size = xsize * stride; + const int64_t data_stride = stride; + constexpr auto vec_size = 256 / 8; + + const auto initial = _mm_set1_epi32(1 << (coefs_precision - 1)); + const auto initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1)); + const auto zero = _mm_setzero_si128(); + const auto zero_256 = _mm256_setzero_si256(); + + int64_t j = 0; + // block 8 + const auto b8_usable_vec_stride = (vec_size / data_stride) * data_stride; + for (; j < data_size - vec_size; j += b8_usable_vec_stride) { + auto sss0 = initial_256; + auto sss1 = initial_256; + auto sss2 = initial_256; + auto sss3 = initial_256; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + + for (; i < ids_size - 1; i += 2) { + // Load 2 values from weight vector + auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]); + + // RGBA: Load 8 pixels per line + // source1 = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // r4 g4 b4 a4 r5 g5 b5 a5 r6 g6 b6 a6 r7 g7 b7 a7 + // ] + // RGB: Load 10 pixels per line (however we can process only 8 pixels): + // source1 = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // r4 g4 b4 r5 g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 + // ] + auto source1 = + _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * i)); + auto source2 = + _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * (i + 1))); + + // Interleave source1 and source2 from the low half of each 128-bit lane + // and cast the result to epi16 + // RGBA: pix1 = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 a0 0 A0 0 + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 a1 0 A1 0 + // ] + // RGB: pix1 = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 0 0 0 0 + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 0 0 0 0 + // ] + auto source_lo = _mm256_unpacklo_epi8(source1, source2); + auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256); + // Compute output value as + // C += w0 * c0 + w1 * C0 + // C += w0 * c1 + w1 * C1 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + + // RGBA: pix2 = [ + // r2 0 R2 0 g2 0 G2 0 b2 0 B2 0 a2 0 A2 0 + // r3 0 R3 0 g3 0 G3 0 b3 0 B3 0 a3 0 A3 0 + // ] + // RGB: pix2 = [ + // r2 0 R2 0 g2 0 G2 0 b2 0 B2 0 0 0 0 0 + // r3 0 R3 0 g3 0 G3 0 b3 0 B3 0 0 0 0 0 + // ] + auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256); + // Compute output value as + // C += w0 * c2 + w1 * C2 + // C += w0 * c3 + w1 * C3 for each channel in 32-bit precision + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + + // Same as above for the high half of each 128-bit lane + auto source_hi = _mm256_unpackhi_epi8(source1, source2); + auto pix3 = _mm256_unpacklo_epi8(source_hi, zero_256); + sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk)); + auto pix4 = _mm256_unpackhi_epi8(source_hi, zero_256); + sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk)); + } + // Same processing as above but with a single weight value + for (; i < ids_size; i += 1) { + auto mmk = _mm256_set1_epi32(k[i]); + + auto source1 = _mm256_loadu_si256((__m256i*)(lineIn_min + i * data_size)); + + auto source_lo = _mm256_unpacklo_epi8(source1, zero_256); + auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + + auto source_hi = _mm256_unpackhi_epi8(source1, zero_256); + auto pix3 = _mm256_unpacklo_epi8(source_hi, _mm256_setzero_si256()); + sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk)); + auto pix4 = _mm256_unpackhi_epi8(source_hi, _mm256_setzero_si256()); + sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk)); + } + // Convert fixed point values back to integers (truncating) + sss0 = _mm256_srai_epi32(sss0, coefs_precision); + sss1 = _mm256_srai_epi32(sss1, coefs_precision); + sss2 = _mm256_srai_epi32(sss2, coefs_precision); + sss3 = _mm256_srai_epi32(sss3, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d) + sss0 = _mm256_packs_epi32(sss0, sss1); + sss2 = _mm256_packs_epi32(sss2, sss3); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d) + sss0 = _mm256_packus_epi16(sss0, sss2); + + // Stores 32 bytes + _mm256_storeu_si256((__m256i*)(lineOut + j), sss0); + } + + // TODO: Do we also need block 4 ??? + // block 2 + const auto b2_usable_vec_stride = (8 / data_stride) * data_stride; + for (; j < data_size - vec_size / 4; j += b2_usable_vec_stride) { + auto sss0 = initial; + auto sss1 = initial; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + + for (; i < ids_size - 1; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ... ] + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + + // Load 2 pixels per line + // RGBA: source1 = [ + // r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0 + // ] + // RGB: source1 = [ + // r0 g0 b0 r1 g1 b1 r2 g2 0 0 0 0 0 0 0 0 + // ] + auto source1 = _mm_loadl_epi64((__m128i *) (lineIn_min + i * data_size)); + auto source2 = _mm_loadl_epi64((__m128i *) (lineIn_min + (i + 1) * data_size)); + // Interleave source1 and source2 and cast the result to epi16 + // RGBA: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 a0 0 A0 0 + // ] + // RGB: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 0 0 0 0 + // ] + auto source = _mm_unpacklo_epi8(source1, source2); + auto pix = _mm_unpacklo_epi8(source, zero); + // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision + sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk)); + // RGBA: pix = [ + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 a1 0 A1 0 + // ] + // RGB: pix = [ + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 0 0 0 0 + // ] + pix = _mm_unpackhi_epi8(source, zero); + // Compute output value as C += w0 * c1 + w1 * C1 for each channel in 32-bit precision + sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk)); + } + // Same processing as above but with a single weight value + for (; i < ids_size; i += 1) { + auto mmk = _mm_set1_epi32(k[i]); + + auto source1 = _mm_loadl_epi64((__m128i*) (lineIn_min + i * data_size)); + + auto source = _mm_unpacklo_epi8(source1, zero); + auto pix1 = _mm_unpacklo_epi8(source, zero); + sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix1, mmk)); + auto pix2 = _mm_unpackhi_epi8(source, zero); + sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix2, mmk)); + } + // Convert fixed point values back to integers (truncating) + sss0 = _mm_srai_epi32(sss0, coefs_precision); + sss1 = _mm_srai_epi32(sss1, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d) + sss0 = _mm_packs_epi32(sss0, sss1); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d) + sss0 = _mm_packus_epi16(sss0, sss0); + // Store 2 pixels to the output + _mm_storel_epi64((__m128i*)(lineOut + j), sss0); + } + + // block 1 + const auto b1_usable_vec_stride = (4 / data_stride) * data_stride; + const auto i32_aligned = num_channels == 4; + for (; j < data_size - 4; j += b1_usable_vec_stride) { + auto sss = initial; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + + for (; i < ids_size - 1; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ... ] + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + + // Load one pixel per line + // RGBA: source1 = [ + // r0 g0 b0 a0 0 0 0 0 0 0 0 0 0 0 0 0 + // ] + // RGB: source1 = [ + // r0 g0 b0 r1 0 0 0 0 0 0 0 0 0 0 0 0 + // ] + auto source1 = mm_cvtsi32_si128(lineIn_min + i * data_size, i32_aligned); + auto source2 = mm_cvtsi32_si128(lineIn_min + (i + 1) * data_size, i32_aligned); + + // Interleave source1 and source2 and cast the result to epi16 + // RGBA: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 a0 0 A0 0 + // ] + // RGB: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 0 0 0 0 + // ] + auto source = _mm_unpacklo_epi8(source1, source2); + auto pix = _mm_unpacklo_epi8(source, zero); + // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + for (; i < ids_size; i++) { + auto mmk = _mm_set1_epi32(k[i]); + auto pix = mm_cvtepu8_epi32(lineIn_min + i * data_size, i32_aligned); + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + sss = _mm_srai_epi32(sss, coefs_precision); + sss = _mm_packs_epi32(sss, zero); + sss = _mm_packus_epi16(sss, zero); + + auto o = _mm_cvtsi128_si32(sss); + + // Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3 + // It is OK to write 4th byte (e.g. X) as on the next step we will overwrite it with new data. + // We also won't go out of bounds of lineOut memory allocation + std::memcpy(lineOut + j, (uint8_t *) &o, 4); + } + + for (; j < data_size; j += data_stride) { + auto sss = initial; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + // For RGBA we can use (ids_size - 1) as tighter limit but for RGB we can read outside memory boundary + // for the last remaining line + for (; i < ids_size - 2; i += 2) { + // Load two coefficients at once + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + + // Load 2 lines + auto source1 = mm_cvtsi32_si128(lineIn_min + i * data_size, i32_aligned); + auto source2 = mm_cvtsi32_si128(lineIn_min + (i + 1) * data_size, i32_aligned); + + auto source = _mm_unpacklo_epi8(source1, source2); + auto pix = _mm_unpacklo_epi8(source, zero); + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // Same processing as above but with a single weight value + for (; i < ids_size; i++) { + auto mmk = _mm_set1_epi32(k[i]); + + const uint8_t * p = lineIn_min + i * data_size; + __m128i pix; + // There is no much perf gain using more detailed condition like + // num_channels == 3 && ids_min + j + data_size * i + 4 >= in_max_size + // const int64_t in_max_size = data_size * in_ysize; + if (num_channels == 3) { + uint8_t input[4]; + std::memcpy(input, p, 3); + pix = mm_cvtepu8_epi32(input, true); + } else { + pix = mm_cvtepu8_epi32(p, true); + } + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // Convert fixed point values back to integers (truncating) + sss = _mm_srai_epi32(sss, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d) + sss = _mm_packs_epi32(sss, zero); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d) + sss = _mm_packus_epi16(sss, zero); + // Store one pixel to the output + auto o = _mm_cvtsi128_si32(sss); + if (num_channels == 3 && C10_UNLIKELY(j + 4 >= data_size)) { + std::memcpy(lineOut + j, (uint8_t *) &o, 3); + } else { + std::memcpy(lineOut + j, (uint8_t *) &o, 4); + } + } +} + +} // anonymous namespace +#endif // CPU_CAPABILITY_AVX2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..a2b3bf4061ffc690c1eb1b6a57577510403f1eeb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +using weight_norm_fn = void(*)( + TensorBase&, TensorBase&, const TensorBase&, const TensorBase&, int64_t); +using weight_norm_backward_fn = void(*)( + TensorBase&, TensorBase&, const TensorBase&, const TensorBase&, + const TensorBase&, const TensorBase&, int64_t); + +DECLARE_DISPATCH(weight_norm_fn, weight_norm_stub) +DECLARE_DISPATCH(weight_norm_backward_fn, weight_norm_backward_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/avx_mathfun.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/avx_mathfun.h new file mode 100644 index 0000000000000000000000000000000000000000..73e827054b4830bfa4154a511aa4feca133cd391 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/avx_mathfun.h @@ -0,0 +1,527 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +/* + AVX implementation of sin, cos, sincos, exp and log + + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + + Copyright (C) 2012 Giovanni Garberoglio + Interdisciplinary Laboratory for Computational Science (LISC) + Fondazione Bruno Kessler and University of Trento + via Sommarive, 18 + I-38123 Trento (Italy) + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#include + +/* The original source of this file has been modified. */ +#if defined(CPU_CAPABILITY_AVX2) + +#if defined(__GNUC__) +# define ALIGN32_BEG __attribute__((aligned(32))) +#elif defined(_WIN32) +# define ALIGN32_BEG __declspec(align(32)) +#endif + +typedef __m256 v8sf; // vector of 8 float (avx2) +typedef __m256i v8si; // vector of 8 int (avx2) + +/* declare some AVX constants -- why can't I figure a better way to do that? */ +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } + +_PS256_CONST(1 , 1.0f); +_PS256_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST256(0, 0); +_PI32_CONST256(1, 1); +_PI32_CONST256(inv1, ~1); +_PI32_CONST256(2, 2); +_PI32_CONST256(4, 4); +_PI32_CONST256(0x7f, 0x7f); + +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2); +_PS256_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS256_CONST(cephes_log_p2, 1.1676998740E-1); +_PS256_CONST(cephes_log_p3, - 1.2420140846E-1); +_PS256_CONST(cephes_log_p4, + 1.4249322787E-1); +_PS256_CONST(cephes_log_p5, - 1.6668057665E-1); +_PS256_CONST(cephes_log_p6, + 2.0000714765E-1); +_PS256_CONST(cephes_log_p7, - 2.4999993993E-1); +_PS256_CONST(cephes_log_p8, + 3.3333331174E-1); +_PS256_CONST(cephes_log_q1, -2.12194440e-4); +_PS256_CONST(cephes_log_q2, 0.693359375); + + +/* natural logarithm computed for 8 simultaneous float + return NaN for x <= 0 +*/ +inline v8sf log256_ps(v8sf x) { + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); + + x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23); + + /* keep only the fractional part */ + x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5); + + // this is again another AVX2 instruction + imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f); + v8sf e = _mm256_cvtepi32_ps(imm0); + + e = _mm256_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); + v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf tmp = _mm256_and_ps(x, mask); + x = _mm256_sub_ps(x, one); + e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); + x = _mm256_add_ps(x, tmp); + + v8sf z = _mm256_mul_ps(x,x); + + v8sf y = *(v8sf*)_ps256_cephes_log_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8); + y = _mm256_mul_ps(y, x); + + y = _mm256_mul_ps(y, z); + + tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1); + y = _mm256_add_ps(y, tmp); + + + tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); + y = _mm256_sub_ps(y, tmp); + + tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2); + x = _mm256_add_ps(x, y); + x = _mm256_add_ps(x, tmp); + x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); + +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS256_CONST(cephes_exp_C1, 0.693359375); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); + +inline v8sf exp256_ps(v8sf x) { + v8sf tmp = _mm256_setzero_ps(), fx; + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); + + /* how to perform a floorf with SSE: just below */ + //imm0 = _mm256_cvttps_epi32(fx); + //tmp = _mm256_cvtepi32_ps(imm0); + + tmp = _mm256_floor_ps(fx); + + /* if greater, subtract 1 */ + //v8sf mask = _mm256_cmpgt_ps(tmp, fx); + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm256_and_ps(mask, one); + fx = _mm256_sub_ps(tmp, mask); + + tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); + x = _mm256_sub_ps(x, tmp); + x = _mm256_sub_ps(x, z); + + z = _mm256_mul_ps(x,x); + + v8sf y = *(v8sf*)_ps256_cephes_exp_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, x); + y = _mm256_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm256_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); + imm0 = _mm256_slli_epi32(imm0, 23); + v8sf pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} + +_PS256_CONST(minus_cephes_DP1, -0.78515625); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS256_CONST(sincof_p0, -1.9515295891E-4); +_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p2, -1.6666654611E-1); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p1, -1.388731625493765E-003); +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + + +/* evaluation of 8 sines at once using AVX intrinsics + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. + +*/ +inline v8sf sin256_ps(v8sf x) { // any x + v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; + v8si imm0, imm2; + + sign_bit = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); + + /* + Here we start a series of integer operations, which are in the + realm of AVX2. + If we don't have AVX, let's perform them using SSE2 directives + */ + + /* store the integer part of y in mm0 */ + imm2 = _mm256_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); + imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1); + y = _mm256_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4); + imm0 = _mm256_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 +#include + +namespace at::native { + +using weight_to_int4pack_fn = void (*)(const Tensor&, const Tensor&); +using int4pack_mm_fn = + void (*)(const Tensor&, const Tensor&, const Tensor&, int, const Tensor&); +using int8pack_mm_fn = + void (*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&); +using dyn_quant_pack_4bit_weight_fn = void (*)( + Tensor&, + const Tensor&, + const Tensor&, + const std::optional& bias, + const int64_t, + const int64_t, + const int64_t); +using dyn_quant_matmul_4bit_fn = void (*)( + const Tensor&, + const Tensor&, + const Tensor&, + const int64_t, + const int64_t, + const int64_t, + const int64_t); + +DECLARE_DISPATCH(weight_to_int4pack_fn, weight_to_int4pack_stub) +DECLARE_DISPATCH(int4pack_mm_fn, int4pack_mm_stub) +DECLARE_DISPATCH(int8pack_mm_fn, int8pack_mm_stub) +DECLARE_DISPATCH( + dyn_quant_pack_4bit_weight_fn, + dyn_quant_pack_4bit_weight_stub) +DECLARE_DISPATCH(dyn_quant_matmul_4bit_fn, dyn_quant_matmul_4bit_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h new file mode 100644 index 0000000000000000000000000000000000000000..dc915e107bfb4dd001d904952f156c410b7ddf54 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h @@ -0,0 +1,46 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +inline ScalarType first_type() { + return ScalarType::Undefined; +} + +template +inline ScalarType first_type(const Tensor& arg, const Args&... parameters) { + return arg.defined() ? arg.scalar_type() : first_type(parameters...); +} + +template +inline bool is_mixed_type(const Tensor& input, const Args&... parameters) { + const auto parameter_type = first_type(parameters...); + return ((parameter_type != ScalarType::Undefined) && + (parameter_type != input.scalar_type())); +} + +// currently on CPU, mixed data type is only supported +// when input is 'BFloat16' or 'Half' and parameters are 'Float' +inline void check_mixed_data_type(const Tensor& input) { + TORCH_CHECK(at::isReducedFloatingType(input.scalar_type()), + "mixed dtype (CPU): all inputs must share same datatype."); +} + +template +inline void check_mixed_data_type(const Tensor& input, const Tensor& parameter, const Args&... parameters) { + TORCH_CHECK(!parameter.defined() || parameter.scalar_type() == ScalarType::Float, + "mixed dtype (CPU): expect parameter to have scalar type of Float"); + check_mixed_data_type(input, parameters...); +} + +inline ScalarType param_scalar_type(const Tensor& t, bool is_mixed_type) { + return is_mixed_type ? ScalarType::Float : t.scalar_type(); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/moments_utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/moments_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fa31a27e798745e9a638473a30c97ff83510de7e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/moments_utils.h @@ -0,0 +1,216 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { + +template using opmath_t = at::opmath_type; + +constexpr int64_t kChunkSize = 16; + +template +void AddMoments( + int64_t m0_add, + const T& m1_add, + const T& m2_add, + int64_t& m0, + T& m1, + T& m2) { + const int64_t n = m0 + m0_add; + const T c = n == 0 ? static_cast(0) : static_cast(m0_add) / static_cast(n); + const T delta = m1_add - m1; + m1 += c * delta; + m2 += m2_add + delta * delta * c * static_cast(m0); + m0 = n; +} + +template +C10_ALWAYS_INLINE void AddMomentsVec( + int64_t m0_add, + const vec::Vectorized& m1_add, + const vec::Vectorized& m2_add, + int64_t& m0, + vec::Vectorized& m1, + vec::Vectorized& m2) { + using Vec = vec::Vectorized; + const int64_t n = m0 + m0_add; + const T c = n == 0 ? static_cast(0) : static_cast(m0_add) / static_cast(n); + const Vec c_vec(c); + const Vec delta = m1_add - m1; + const Vec m2_tmp = m2 + m2_add; + const Vec c_vec_delta = c_vec * delta; + const Vec m0_delta = delta * Vec(static_cast(m0)); + m1 = m1 + c_vec_delta; + m2 = fmadd(m0_delta, c_vec_delta, m2_tmp); + m0 = n; +} + +template +inline std::enable_if_t>, void> +UpdateMomentsVec( + int64_t m0, + const T* X_ptr, + const std::array>, kChunkSize>& c_vecs, + int64_t& m0_stk0, + vec::Vectorized>& m1_stk0, + vec::Vectorized>& m2_stk0) { + using Vec = vec::Vectorized>; + Vec m1_vec(0); + Vec m2_vec(0); + for (const auto j : c10::irange(m0)) { + const Vec x_vec = Vec::loadu(X_ptr + j * Vec::size()); + const Vec tmpVec = c_vecs[j]; + const Vec delta_vec = x_vec - m1_vec; + m1_vec = fmadd(tmpVec, delta_vec, m1_vec); + const Vec tmpVec2 = x_vec - m1_vec; + m2_vec = fmadd(delta_vec, tmpVec2, m2_vec); + } + AddMomentsVec(m0, m1_vec, m2_vec, m0_stk0, m1_stk0, m2_stk0); +} + +// each bfloat16/half vector will be converted to two float vectors, +// and accumulated successively on m1_stk0/m2_stk0. +template +inline std::enable_if_t>, void> +UpdateMomentsVec( + int64_t m0, + const T* X_ptr, + const std::array>, kChunkSize>& c_vecs, + int64_t& m0_stk0, + vec::Vectorized>& m1_stk0, + vec::Vectorized>& m2_stk0) { + using Vec = vec::Vectorized; + using fVec = vec::Vectorized>; + fVec m1_fvec0(0), m1_fvec1(0); + fVec m2_fvec0(0), m2_fvec1(0); + for (const auto j : c10::irange(m0)) { + const Vec x_bvec = Vec::loadu(X_ptr + j * Vec::size()); + const fVec tmpVec = c_vecs[j]; + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + const fVec delta_fvec0 = x_fvec0 - m1_fvec0; + const fVec delta_fvec1 = x_fvec1 - m1_fvec1; + m1_fvec0 = fmadd(delta_fvec0, tmpVec, m1_fvec0); + m1_fvec1 = fmadd(delta_fvec1, tmpVec, m1_fvec1); + const fVec delta_fvec2 = x_fvec0 - m1_fvec0; + const fVec delta_fvec3 = x_fvec1 - m1_fvec1; + m2_fvec0 = fmadd(delta_fvec0, delta_fvec2, m2_fvec0); + m2_fvec1 = fmadd(delta_fvec1, delta_fvec3, m2_fvec1); + } + AddMomentsVec(m0, m1_fvec0, m2_fvec0, m0_stk0, m1_stk0, m2_stk0); + AddMomentsVec(m0, m1_fvec1, m2_fvec1, m0_stk0, m1_stk0, m2_stk0); +} + +// Compute rowwise moments by Welford algorithm and cascade sum to improve +// numerical stability. +// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance +// https://en.wikipedia.org/wiki/Pairwise_summation +template +std::pair, opmath_t> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) { + using math_t = opmath_t; + + constexpr int64_t kVecSize = vec::Vectorized::size(); + constexpr int64_t kAccVecSize = vec::Vectorized::size(); + const int64_t n = N / kVecSize; + const int64_t m = divup(n, kChunkSize); + const int64_t depth = utils::CeilLog2(m); + + using Vec = vec::Vectorized; + const Vec kZeroVec(math_t(0)); + std::array m0_stk = {{0}}; + std::array m1_stk; + m1_stk.fill(kZeroVec); + std::array m2_stk; + m2_stk.fill(kZeroVec); + + for (const auto i : c10::irange(m)) { + const T* X_ptr = X + i * kChunkSize * kVecSize; + const int64_t m0 = std::min(kChunkSize, n - i * kChunkSize); + static std::array c_vecs = ([]() { + std::array result; + for (const auto i : c10::irange(kChunkSize)) { + result[i] = Vec(math_t(1) / static_cast(i + 1)); + } + return result; + })(); + UpdateMomentsVec(m0, X_ptr, c_vecs, m0_stk[0], m1_stk[0], m2_stk[0]); + + int64_t mask = i + 1; + for (int64_t j = 1; j < depth && (mask & 1) == 0; ++j) { + AddMomentsVec( + m0_stk[j - 1], + m1_stk[j - 1], + m2_stk[j - 1], + m0_stk[j], + m1_stk[j], + m2_stk[j]); + m0_stk[j - 1] = 0; + m1_stk[j - 1] = kZeroVec; + m2_stk[j - 1] = kZeroVec; + mask >>= 1; + } + } + for (const auto i : c10::irange(1, depth)) { + AddMomentsVec( + m0_stk[i], m1_stk[i], m2_stk[i], m0_stk[0], m1_stk[0], m2_stk[0]); + } + + std::array m1_arr{}; + std::array m2_arr{}; + m1_stk[0].store(m1_arr.data()); + m2_stk[0].store(m2_arr.data()); + + int64_t m0 = 0; + math_t m1 = 0; + math_t m2 = 0; + for (int64_t i = n * kVecSize; i < N; ++i) { + math_t x = static_cast(X[i]); + const math_t delta = x - m1; + ++m0; + m1 += delta / static_cast(m0); + m2 += delta * (x - m1); + } + // for BFloat16, each vector in m1_arr/m2_arr holds 2*n accumulated result + int64_t m0_add = n * kVecSize / kAccVecSize; + for (const auto i : c10::irange(kAccVecSize)) { + AddMoments(m0_add, m1_arr[i], m2_arr[i], m0, m1, m2); + } + + return std::make_pair(m1, m2 / static_cast(N - ddof)); +} + +template +std::pair, opmath_t> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) { + using Vec = vec::Vectorized; + constexpr int64_t kVecSize = Vec::size(); + const int64_t n = N / kVecSize; + const int64_t m = divup(n, kChunkSize); + const int64_t depth = utils::CeilLog2(m); + if (depth <= 4) { + return RowwiseMomentsImpl(X, N, ddof); + } else if (depth <= 8) { + return RowwiseMomentsImpl(X, N, ddof); + } else if (depth <= 16) { + return RowwiseMomentsImpl(X, N, ddof); + } else if (depth <= 32) { + return RowwiseMomentsImpl(X, N, ddof); + } else { + return RowwiseMomentsImpl(X, N, ddof); + } +} + +} // namespace CPU_CAPABILITY +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..492d89906e3657bb247f4b888b06e3aa5eeb5fee --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/utils.h @@ -0,0 +1,225 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#ifdef USE_FBGEMM +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") +#include +C10_DIAGNOSTIC_POP() +#endif + +namespace at::native { + +template +inline void _store(T* dst, at::vec::Vectorized src) { + src.store(dst); +} + +inline void _store(at::BFloat16* dst, at::vec::Vectorized src) { + auto res = at::vec::convert_float_bfloat16(src, src); + res.store(dst, at::vec::Vectorized::size()); +} + +inline void _store(at::Half* dst, at::vec::Vectorized src) { + auto res = at::vec::convert_float_half(src, src); + res.store(dst, at::vec::Vectorized::size()); +} + +inline namespace CPU_CAPABILITY { + +template +inline T data_index_init(T offset) { + return offset; +} + +template +inline T data_index_init(T offset, T& x, const T& X, Args&&... args) { + offset = data_index_init(offset, std::forward(args)...); + x = offset % X; + return offset / X; +} + +inline bool data_index_step() { + return true; +} + +template +inline bool data_index_step(T& x, const T& X, Args&&... args) { + if (data_index_step(std::forward(args)...)) { + x = ((x + 1) == X) ? 0 : (x + 1); + return x == 0; + } + return false; +} + +// Helper struct for bfloat16/float16 vectorization +// Useful when you need float as immediate dtype or accumulate dtype +using namespace vec; +struct Vec2 { + Vectorized val0, val1; + Vec2(Vectorized v0, Vectorized v1) : val0(v0), val1(v1) {} + Vec2(float v) : val0(v), val1(v) {} + static Vec2 loadu(const BFloat16* ptr) { + auto [v0, v1] = convert_bfloat16_float(Vectorized::loadu(ptr)); + return {v0, v1}; + } + static Vec2 loadu(const Half* ptr) { + auto [v0, v1] = convert_half_float(Vectorized::loadu(ptr)); + return {v0, v1}; + } + static Vec2 loadu(const float* ptr) { + return {Vectorized::loadu(ptr), Vectorized::loadu(ptr + Vectorized::size())}; + } + void store(BFloat16* ptr) const { + Vectorized val = convert_float_bfloat16(val0, val1); + val.store(ptr); + } + void store(Half* ptr) const { + Vectorized val = convert_float_half(val0, val1); + val.store(ptr); + } + void store(float* ptr) const { + val0.store(ptr); + val1.store(ptr + Vectorized::size()); + } +}; +inline Vec2 operator+(const Vec2& a, const Vec2& b) { return {a.val0 + b.val0, a.val1 + b.val1}; } +inline Vec2 operator*(const Vec2& a, const Vec2& b) { return {a.val0 * b.val0, a.val1 * b.val1}; } +inline Vec2 operator-(const Vec2& a, const Vec2& b) { return {a.val0 - b.val0, a.val1 - b.val1}; } +inline Vec2 operator/(const Vec2& a, const Vec2& b) { return {a.val0 / b.val0, a.val1 / b.val1}; } +inline Vec2 maximum(const Vec2& a, const Vec2& b) { return {vec::maximum(a.val0, b.val0), vec::maximum(a.val1, b.val1)}; } +inline Vec2 minimum(const Vec2& a, const Vec2& b) { return {vec::minimum(a.val0, b.val0), vec::minimum(a.val1, b.val1)}; } + +template struct VectorizedType { using type = Vectorized; }; +template <> struct VectorizedType { using type = Vec2; }; +template <> struct VectorizedType { using type = Vec2; }; +template using VecType = typename VectorizedType::type; + +// Helper for mixed data type parameter Vec::load +inline std::tuple, Vectorized> load2f(const BFloat16* ptr) { + return convert_bfloat16_float(Vectorized::loadu(ptr)); +} + +inline std::tuple, Vectorized> load2f(const Half* ptr) { + return convert_half_float(Vectorized::loadu(ptr)); +} + +inline std::tuple, Vectorized> load2f(const float* ptr) { + using Vec = Vectorized; + return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size())); +} + +inline std::tuple, Vectorized> load2f(const BFloat16* ptr, int64_t count) { + return convert_bfloat16_float(Vectorized::loadu(ptr, count)); +} + +inline std::tuple, Vectorized> load2f(const Half* ptr, int64_t count) { + return convert_half_float(Vectorized::loadu(ptr, count)); +} + +inline std::tuple, Vectorized> load2f(const float* ptr, int64_t count) { + using Vec = Vectorized; + if (count > Vec::size()) { + return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size(), count - Vec::size())); + } else { + return std::make_tuple(Vec::loadu(ptr, count), Vec(0)); + } +} + +} // namespace + +namespace utils { + +template +T CeilLog2(const T& x) { + if (x <= 2) { + return 1; + } + // Last set bit is floor(log2(x)), floor + 1 is ceil + // except when x is an exact powers of 2, so subtract 1 first + return static_cast(llvm::findLastSet(static_cast(x) - 1)) + 1; +} + +// matrix transpose: +// src has shape of M by N, with leading dimension of ld_src +// dst has shape of N by M, with leading dimension of ld_dst +template +inline void transpose(int64_t M, int64_t N, const T* src, int64_t ld_src, T* dst, int64_t ld_dst) { + for (int64_t j = 0; j < N; j++) { + for (int64_t i = 0; i < M; i++) { + dst[j * ld_dst + i] = c10::load(&(src[i * ld_src + j])); + } + } +} + +#ifdef USE_FBGEMM +template <> +inline void transpose(int64_t M, int64_t N, const float* src, int64_t ld_src, float* dst, int64_t ld_dst) { + TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); + fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst); +} + +template <> +inline void transpose(int64_t M, int64_t N, const uint16_t* src, int64_t ld_src, uint16_t* dst, int64_t ld_dst) { + TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); + fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst); +} + +template <> +inline void transpose(int64_t M, int64_t N, const uint8_t* src, int64_t ld_src, uint8_t* dst, int64_t ld_dst) { + TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); + fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst); +} +#endif + +template +inline void parallel_sparse_csr( + const TensorAccessor& crow_acc, + const int64_t M, + const int64_t nnz, + const F& f) { + TORCH_CHECK(crow_acc.size(0) == M + 1); + + // directly parallel on `M` may lead to load imbalance, + // statically determine thread partition here to average payload + // for each thread. + int num_threads = at::get_num_threads(); + std::vector thread_splits(num_threads + 1, M); + + int64_t thread_averge_payload = std::max((int64_t)1, divup(nnz, num_threads)); + + thread_splits[0] = 0; + int64_t sum = 0; + int64_t t = 1; + for (const auto m : c10::irange(M)) { + int64_t row_start = crow_acc[m]; + int64_t row_end = crow_acc[m + 1]; + sum += row_end - row_start; + if (sum > t * thread_averge_payload) { + thread_splits[t] = m; + t++; + } + } + // need to restore the last index, + // due to rounding error when calculating `thread_averge_payload`. + thread_splits[num_threads] = M; + + at::parallel_for(0, num_threads, 1, [&](int64_t cbegin, int64_t cend) { + int tid = at::get_thread_num(); + int64_t begin = thread_splits[tid]; + int64_t end = thread_splits[tid + 1]; + f(begin, end); + }); +} + +} // namespace utils + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/zmath.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/zmath.h new file mode 100644 index 0000000000000000000000000000000000000000..81b7978bf77b3da919aa64f20eee91de249b21ec --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cpu/zmath.h @@ -0,0 +1,255 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// Complex number math operations that act as no-ops for other dtypes. +#include +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { + +template +inline VALUE_TYPE zabs (SCALAR_TYPE z) { + return z; +} + +template<> +inline c10::complex zabs > (c10::complex z) { + return c10::complex(std::abs(z)); +} + +template<> +inline float zabs , float> (c10::complex z) { + return std::abs(z); +} + +template<> +inline c10::complex zabs > (c10::complex z) { + return c10::complex(std::abs(z)); +} + +template<> +inline double zabs , double> (c10::complex z) { + return std::abs(z); +} + +// This overload corresponds to non-complex dtypes. +// The function is consistent with its NumPy equivalent +// for non-complex dtypes where `pi` is returned for +// negative real numbers and `0` is returned for 0 or positive +// real numbers. +// Note: `nan` is propagated. +template +inline VALUE_TYPE angle_impl (SCALAR_TYPE z) { + if (at::_isnan(z)) { + return z; + } + return z < 0 ? c10::pi : 0; +} + +template<> +inline c10::complex angle_impl > (c10::complex z) { + return c10::complex(std::arg(z), 0.0); +} + +template<> +inline float angle_impl , float> (c10::complex z) { + return std::arg(z); +} + +template<> +inline c10::complex angle_impl > (c10::complex z) { + return c10::complex(std::arg(z), 0.0); +} + +template<> +inline double angle_impl , double> (c10::complex z) { + return std::arg(z); +} + +template +constexpr VALUE_TYPE real_impl (SCALAR_TYPE z) { + return z; //No-Op +} + +template<> +constexpr c10::complex real_impl > (c10::complex z) { + return c10::complex(z.real(), 0.0); +} + +template<> +constexpr float real_impl , float> (c10::complex z) { + return z.real(); +} + +template<> +constexpr c10::complex real_impl > (c10::complex z) { + return c10::complex(z.real(), 0.0); +} + +template<> +constexpr double real_impl , double> (c10::complex z) { + return z.real(); +} + +template +constexpr VALUE_TYPE imag_impl (SCALAR_TYPE /*z*/) { + return 0; +} + +template<> +constexpr c10::complex imag_impl > (c10::complex z) { + return c10::complex(z.imag(), 0.0); +} + +template<> +constexpr float imag_impl , float> (c10::complex z) { + return z.imag(); +} + +template<> +constexpr c10::complex imag_impl > (c10::complex z) { + return c10::complex(z.imag(), 0.0); +} + +template<> +constexpr double imag_impl , double> (c10::complex z) { + return z.imag(); +} + +template +inline TYPE conj_impl (TYPE z) { + return z; //No-Op +} + +template<> +inline c10::complex conj_impl > (c10::complex z) { + return c10::complex{z.real(), -z.imag()}; +} + +template<> +inline c10::complex conj_impl > (c10::complex z) { + return c10::complex(z.real(), -z.imag()); +} + +template<> +inline c10::complex conj_impl > (c10::complex z) { + return c10::complex(z.real(), -z.imag()); +} + +template +inline TYPE ceil_impl (TYPE z) { + return std::ceil(z); +} + +template <> +inline c10::complex ceil_impl (c10::complex z) { + return c10::complex(std::ceil(z.real()), std::ceil(z.imag())); +} + +template <> +inline c10::complex ceil_impl (c10::complex z) { + return c10::complex(std::ceil(z.real()), std::ceil(z.imag())); +} + +template +inline c10::complex sgn_impl (c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / zabs(z); + } +} + +template +inline TYPE floor_impl (TYPE z) { + return std::floor(z); +} + +template <> +inline c10::complex floor_impl (c10::complex z) { + return c10::complex(std::floor(z.real()), std::floor(z.imag())); +} + +template <> +inline c10::complex floor_impl (c10::complex z) { + return c10::complex(std::floor(z.real()), std::floor(z.imag())); +} + +template +inline TYPE round_impl (TYPE z) { + return std::nearbyint(z); +} + +template <> +inline c10::complex round_impl (c10::complex z) { + return c10::complex(std::nearbyint(z.real()), std::nearbyint(z.imag())); +} + +template <> +inline c10::complex round_impl (c10::complex z) { + return c10::complex(std::nearbyint(z.real()), std::nearbyint(z.imag())); +} + +template +inline TYPE trunc_impl (TYPE z) { + return std::trunc(z); +} + +template <> +inline c10::complex trunc_impl (c10::complex z) { + return c10::complex(std::trunc(z.real()), std::trunc(z.imag())); +} + +template <> +inline c10::complex trunc_impl (c10::complex z) { + return c10::complex(std::trunc(z.real()), std::trunc(z.imag())); +} + +template ::value, int> = 0> +inline TYPE max_impl (TYPE a, TYPE b) { + if (_isnan(a) || _isnan(b)) { + return std::numeric_limits::quiet_NaN(); + } else { + return std::max(a, b); + } +} + +template ::value, int> = 0> +inline TYPE max_impl (TYPE a, TYPE b) { + if (_isnan(a)) { + return a; + } else if (_isnan(b)) { + return b; + } else { + return std::abs(a) > std::abs(b) ? a : b; + } +} + +template ::value, int> = 0> +inline TYPE min_impl (TYPE a, TYPE b) { + if (_isnan(a) || _isnan(b)) { + return std::numeric_limits::quiet_NaN(); + } else { + return std::min(a, b); + } +} + +template ::value, int> = 0> +inline TYPE min_impl (TYPE a, TYPE b) { + if (_isnan(a)) { + return a; + } else if (_isnan(b)) { + return b; + } else { + return std::abs(a) < std::abs(b) ? a : b; + } +} + +} // end namespace +} //end at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Activation.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Activation.h new file mode 100644 index 0000000000000000000000000000000000000000..4ca11a5566b860dbfe7866db7c10b532e9ab0537 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Activation.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +struct TensorIteratorBase; +class TensorBase; +} + +namespace at::native { + +void launch_glu_backward_kernel(const TensorIteratorBase& iter, + int64_t gI_stride, int64_t I_stride); + +void launch_log_sigmoid_forward_kernel(TensorIteratorBase& iter); + +void GeluCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate); +void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h new file mode 100644 index 0000000000000000000000000000000000000000..1002f53b0793f655db825b5203e500a93af33615 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/BinaryInternal.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// DON'T include this except from Binary*.cu files. It should not leak into +// headers. +#pragma once +#define TORCH_ASSERT_NO_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native::binary_internal { + +template +struct DivFunctor { + __device__ scalar_t operator()(scalar_t a, scalar_t b) const { + return a / b; + } +}; + +template +struct MulFunctor { + __device__ T operator()(T a, T b) const { + return a * b; + } +}; + +// Workaround for the error: '*' in boolean context, suggest '&&' instead +// [-Werror=int-in-bool-context] +template <> +struct MulFunctor { + __device__ bool operator()(bool a, bool b) const { + return a && b; + } +}; +void div_true_kernel_cuda(TensorIteratorBase& iter); +void div_trunc_kernel_cuda(TensorIteratorBase& iter); +} // namespace at::native::binary_internal + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh new file mode 100644 index 0000000000000000000000000000000000000000..5540b6143da661f7071a497610c0faa2919513c2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh @@ -0,0 +1,332 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +// Jiterator functions are guarded behind this macro +#if AT_USE_JITERATOR() + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace at::native { + +template +// warning : unused parameter when tuple is empty. +constexpr auto tuple_to_array_helper(const Tuple& t [[maybe_unused]], std::index_sequence seq) { + constexpr auto size = seq.size(); + return std::array{static_cast(&std::get(t))...}; +} + +// Helper function convert tuple to std::array +// for passing the arguments to CUDA Kernel +// NOTE: We capture tuple by reference, +// so the pointers in returned array are only valid +// till tuple is alive. +template +constexpr auto tuple_to_array(const std::tuple& extra_args) { + constexpr auto tuple_size = sizeof...(Args); + return tuple_to_array_helper(extra_args, std::make_index_sequence{}); +} + +struct JittedVecKernelCache { + // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) + at::cuda::jit::NvrtcFunction vec1; + at::cuda::jit::NvrtcFunction vec2; + at::cuda::jit::NvrtcFunction vec4; + at::cuda::jit::NvrtcFunction vec8; +#ifdef USE_ROCM + at::cuda::jit::NvrtcFunction vec16; +#endif + +}; + +struct JittedKernelVariantCache { + JittedVecKernelCache vec; + at::cuda::jit::NvrtcFunction noncontiguous; + at::cuda::jit::NvrtcFunction dynamic_contiguous; + at::cuda::jit::NvrtcFunction dynamic_noncontiguous; +}; + +inline c10::SmallBuffer pack_kernel_args( + std::initializer_list args, + c10::ArrayRef extra_args) { + c10::SmallBuffer ret(args.size() + extra_args.size()); + std::copy(args.begin(), args.end(), ret.data()); + std::copy(extra_args.begin(), extra_args.end(), ret.data() + args.size()); + return ret; +} + +template +void launch_jitted_unrolled_kernel( + std::mutex &jiterator_mutex, + at::cuda::jit::NvrtcFunction &fn_cache, + const at::cuda::jit::KernelDescriptor &desc, + int64_t N, + array_t data, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s, + bool contiguous, + at::cuda::jit::BinaryFuncVariant scalar_pos, + const void* scalar_val, + c10::ArrayRef extra_args) { + + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + + int tws = at::cuda::jit::calc_thread_work_size(desc.nInputs, desc.nOutputs, desc.f_inputs_type, desc.result_type); + int bws = tws * num_threads(); + //casting result to int is always safe, intermediate is int64 and won't overflow + const uint32_t grid = (N + bws - 1) / bws; + + if (!fn_cache.function) { + const std::lock_guard lock{jiterator_mutex}; + if (!fn_cache.function) { + constexpr bool dynamic_casting = !std::is_same() || + !std::is_same(); + auto code = at::cuda::jit::generate_code( + desc, contiguous, dynamic_casting, scalar_pos, tws); + fn_cache = at::cuda::jit::jit_pwise_function(code, desc.name); + } + } + + auto args = pack_kernel_args({&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args); + at::cuda::jit::launch_jitted_pwise_function(fn_cache, args.data(), {grid, 1u, 1u}, + {num_threads(), 1u, 1u}); +} + +template +void launch_jitted_vectorized_kernel( + std::mutex &jiterator_mutex, JittedVecKernelCache &fn_cache, + const at::cuda::jit::KernelDescriptor &desc, int64_t N, array_t data, + at::cuda::jit::BinaryFuncVariant scalar_pos, + const void *scalar_val, c10::ArrayRef extra_args) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + + int tws = at::cuda::jit::calc_thread_work_size(desc.nInputs, desc.nOutputs, desc.f_inputs_type, desc.result_type); + int bws = tws * num_threads(); + // N is still int64_t for the computation, but it's always safe to cast result to int + const uint32_t grid = (N + bws - 1) / bws; + + int vec_size = at::cuda::jit::can_vectorize_up_to( + desc, c10::ArrayRef(data.data(), data.size())); + +#ifndef USE_ROCM + const auto input_size = c10::scalarTypeToTypeMeta(desc.f_inputs_type).itemsize(); + const int optimal_vec_size = 16 / static_cast(input_size); + vec_size = std::min(optimal_vec_size, vec_size); + // Here we purposely omit vec8 for 1-byte data because of a bug in NVCC + // that causes some numerical mismatches with uint8 on sm80 and sm90. + // TODO: Revisit this after CUDA 12.8 update. + if (input_size < 2) { + vec_size = std::min(vec_size, 4); + } +#endif + + // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) + // fn_ptr is set to the appropriate function based on the vec size and GPU used + at::cuda::jit::NvrtcFunction* fn_ptr = nullptr; + +#ifdef USE_ROCM + if (vec_size == 16) { + fn_ptr = &fn_cache.vec16; + } else +#endif + if (vec_size == 8) { + fn_ptr = &fn_cache.vec8; + } else if (vec_size == 4) { + fn_ptr = &fn_cache.vec4; + } else if (vec_size == 2) { + fn_ptr = &fn_cache.vec2; + } else if (vec_size ==1) { + fn_ptr = &fn_cache.vec1; + } else { + TORCH_INTERNAL_ASSERT(false, "unexpected vec_size for jitter vectorized kernel"); + } + + bool vectorized = vec_size > 1; + + if (!fn_ptr->function) { + const std::lock_guard lock{jiterator_mutex}; + if (!fn_ptr->function) { // cache miss! + + // Generates program + auto code = at::cuda::jit::generate_code( + desc, /*contiguous=*/true, /*dynamic_casting=*/false, + scalar_pos, tws, vectorized, vec_size); + std::string kernel_name = vectorized ? desc.name + "_vectorized" + std::to_string(vec_size) : desc.name; + + // Acquires the program + *fn_ptr = at::cuda::jit::jit_pwise_function(code, kernel_name); + } + } + + if (vectorized) { + auto args = pack_kernel_args({&N, &data, scalar_val}, extra_args); + at::cuda::jit::launch_jitted_pwise_function( + *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); + } else { +// NVCC complains about unused variables l and s. +// It should be false positive in most cases, so we suppress the warnings. +#pragma nv_diagnostic push +#pragma nv_diag_suppress 177 + auto ic = TrivialOffsetCalculator(); + auto oc = TrivialOffsetCalculator<1>(); + auto l = memory::LoadWithoutCast(); + auto s = memory::StoreWithoutCast(); + + auto args = pack_kernel_args( + {&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args); + at::cuda::jit::launch_jitted_pwise_function( + *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); +#pragma nv_diagnostic pop + } +} + +template +void jitted_gpu_kernel_generic( + std::mutex &jiterator_mutex, + JittedKernelVariantCache &cache, + const at::cuda::jit::KernelDescriptor &desc, + at::cuda::jit::BinaryFuncVariant scalar_pos, + c10::ArrayRef extra_args, + TensorIteratorBase& iter, + const bool dynamic_casting, + const void *scalar_val) { + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ninputs() == arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + + constexpr int ntensors = arity + 1; + std::array data; + for (auto i : c10::irange(ntensors)) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + bool contiguous = iter.is_contiguous(); + + // Decides which of 4 kernel types to launch + // Variations are: + // - Case 1: no dynamic casting and contiguous + // - Case 2: no dynamic casting and noncontiguous + // - Case 3: dynamic casting and contiguous + // - Case 4: dynamic casting and noncontiguous + // These cases align with the non-jitted CUDALoops.cuh cases in gpu_kernel_impl + + if (!dynamic_casting) { + if (contiguous) { + // Case 1: no dynamic casting and contiguous + launch_jitted_vectorized_kernel( + jiterator_mutex, cache.vec, desc, + numel, data, scalar_pos, scalar_val, extra_args); + return; + } + + // Case 2: no dynamic casting and noncontiguous + auto input_offset_calculator = make_input_offset_calculator(iter); + auto output_offset_calculator = make_output_offset_calculator(iter); + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.noncontiguous, desc, numel, data, + input_offset_calculator, output_offset_calculator, loader, + storer, contiguous, scalar_pos, scalar_val, extra_args); + return; + } + + // Cases 3 and 4 are handled below + // Both require construction of a storer (this asserts 1 output) and one or more loaders + + // Creates store cast to output (the zeroth tensor in TensorIterator) + auto storer = memory::StoreWithCast<1>(iter); + + // Creates load casts from inputs (note offset indexing into the iterators 1...n tensors) + auto loader = memory::LoadWithCast(iter); + + if (contiguous) { + // Case 3: dynamic casting and contiguous + auto input_offset_calculator = TrivialOffsetCalculator(); + auto output_offset_calculator = TrivialOffsetCalculator<1>(); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.dynamic_contiguous, desc, numel, data, input_offset_calculator, + output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args); + return; + } + + // Case 4: dynamic casting and noncontiguous + auto input_offset_calculator = make_input_offset_calculator(iter); + auto output_offset_calculator = make_output_offset_calculator(iter); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.dynamic_noncontiguous, desc, numel, data, input_offset_calculator, + output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args); +} + +// NOTE: static to reduce chances of name collision. +template < + char const* name, + typename result_type, + typename f_inputs_type, + int arity, + at::cuda::jit::BinaryFuncVariant scalar_pos = + at::cuda::jit::BinaryFuncVariant::NoScalar, + typename... ExtraArgs> +static void jitted_gpu_kernel_impl( + TensorIteratorBase& iter, + const std::string &f, + const bool dynamic_casting, + at::opmath_type scalar_val, + const std::tuple& extra_args) { + + // TODO: Memory use can probably be optimized by reusing kernels across GPUs with + // the same compute capability + static std::mutex jiterator_mutex; + static std::vector device_caches(c10::cuda::device_count()); + + constexpr int nInputs = arity; + constexpr int nOutputs = 1; // TODO: Support more than 1 output + static const auto desc = at::cuda::jit::make_kernel_descriptor< + result_type, f_inputs_type, ExtraArgs...>(name, f, nInputs, nOutputs); + + auto &cache = device_caches[iter.device().index()]; + auto extra_args_array = tuple_to_array(extra_args); + return jitted_gpu_kernel_generic( + jiterator_mutex, + cache, + desc, + scalar_pos, + extra_args_array, + iter, + dynamic_casting, + &scalar_val + ); +} + +} // at::native + +#endif // AT_USE_JITERATOR() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CUDALoops.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CUDALoops.cuh new file mode 100644 index 0000000000000000000000000000000000000000..37684e127bed6702510109919f38de7a5528f5d3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CUDALoops.cuh @@ -0,0 +1,1136 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// This file provides two functions to help write GPU elementwise kernels: +// +// gpu_kernel(TensorIterator iter, ) +// gpu_kernel_with_scalars(TensorIterator iter, ) +// +// The gpu_kernel_with_scalars generates specializations that support a +// single scalar CPU argument, such as from `cuda_tensor + 5`. The CPU scalar +// is lifted to a kernel parameter instead of copying to device memory. +// This should be used in conjunction with TensorIterator::allow_cpu_scalars_, +// which is the default for TensorIterator::binary_op. Otherwise, all inputs +// and the output must be on the GPU. +// +// For example, to write a reciprocal kernel for GPU float Tensors: +// +// gpu_kernel(iter, []GPU_LAMBDA(float a) { +// return 1.0f / a; +// }); +// +// To write a multiplication kernel for GPU float Tensors where one argument +// may be a CPU scalar: +// +// gpu_kernel_with_scalars(iter, []GPU_LAMBDA(float a, float b) { +// return a * b; +// }); +// +// See BinaryOpsKernel.cu for the complete implementation +// + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __NVCC__ +#define ASSERT_HOST_DEVICE_LAMBDA(type) \ + static_assert( \ + __nv_is_extended_host_device_lambda_closure_type(type), \ + #type " must be a __host__ __device__ lambda") +#else +#define ASSERT_HOST_DEVICE_LAMBDA(type) +#endif + +namespace at::native { + +#ifdef USE_ROCM +// Custom configuration for vectorized elementwise kernel +// with template instantiation. +namespace vectorized_templated_config { +constexpr int num_threads() { + return 512; +} + +constexpr int elems_per_thread() { + return 32; +} + +constexpr int block_work_size() { + return elems_per_thread() * num_threads(); +} +} // namespace vectorized_templated_config +#endif + +template +constexpr auto sum_of_sizes(args_t args, std::index_sequence) { + if constexpr (sizeof...(Is) == 0) { + return 0; + } else { + return (sizeof(std::tuple_element_t) + ...); + } +} + +#ifdef USE_ROCM +template +constexpr auto elems_per_thread(){ + if constexpr (io_sizes == 1) { + return 16; + } else if constexpr (io_sizes < 4) { + return 8; + } else { + return 4; + } +} +#else +template +constexpr auto elems_per_thread(){ + if constexpr (io_sizes == 1) { + return 16; + } else { + return 8; + } +} +#endif + + +//thread work size of 8 regresses the perf of elementwise kernel on cuda +//this doesn't change ROCm behavior as thread_work_size is already 4 on ROCm +constexpr int elementwise_thread_work_size() {return 4;} +constexpr int elementwise_block_work_size() { + return elementwise_thread_work_size() * num_threads(); +} + +template +constexpr auto io_block_work_size() { + return num_threads() * elems_per_thread(); +} + +#ifdef USE_ROCM +template +constexpr auto input_size(args_t args, std::index_sequence) { + if constexpr (sizeof...(Is) == 0) { + return 0; + } else { + return sizeof(std::tuple_element_t<0, args_t>); + } +} + +template +constexpr auto calc_optimal_vec_size() { + static_assert(vec_size != 0); + static_assert(io_size != 0); + if constexpr (io_size == 1 && vec_size >= 16) { + return 16; + } else if constexpr (io_size <= 2 && vec_size >= 8) { + return 8; + } else if constexpr (io_size <= 4 && vec_size >= 4) { + return 4; + } else if constexpr (vec_size >= 4) { + return 4; + } else if constexpr (vec_size >= 2) { + return 2; + } else { + return 1; + } +} +#endif + +template +constexpr auto calc_io_size(){ + using traits = function_traits; + using args_t = typename traits::ArgsTuple; +#ifdef USE_ROCM + constexpr auto input_size = at::native::input_size(args_t{}, std::make_index_sequence>{}); + constexpr auto output_size = sizeof(typename traits::result_type); + return (input_size > 0) ? ((input_size < output_size) ? input_size : output_size) : output_size; +#else + constexpr auto input_size = at::native::sum_of_sizes(args_t{}, std::make_index_sequence>{}); + constexpr auto output_size = sizeof(typename traits::result_type); + return input_size + output_size; +#endif +} + +#ifndef USE_ROCM +// To save on binary size of libtorch_cuda.so, we split the vectorized_elementwise_kernel +// into two: one for vec_size=8 and one for vec_size=[2, 4], since vec8 is going to be +// used on sm_90 and sm_100 exclusively. +template +C10_LAUNCH_BOUNDS_1(num_threads()) +__global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) { + if constexpr (vec_size == 8) { +#if __CUDA_ARCH__ == 900 || __CUDA_ARCH__ == 1000 + using traits = function_traits; + constexpr auto io_size = calc_io_size(); + int remaining = N - io_block_work_size() * blockIdx.x; + + if (remaining < io_block_work_size()) { // if this block handles the reminder, + // just do a naive unrolled loop + auto input_calc = TrivialOffsetCalculator(); + auto output_calc = TrivialOffsetCalculator<1>(); + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + auto policy = memory::policies::unroll< + array_t, + decltype(input_calc), + decltype(output_calc), + memory::LoadWithoutCast, + memory::StoreWithoutCast, + elems_per_thread()>( + data, remaining, input_calc, output_calc, loader, storer); + elementwise_kernel_helper(f, policy); + } else { // if this block has a full `block_work_size` data to handle, use + // vectorized memory access + elementwise_kernel_helper( + f, memory::policies::vectorized()>(data)); + } +#endif // __CUDA_ARCH__ == 900 || __CUDA_ARCH__ == 1000 + } else { + using traits = function_traits; + constexpr auto io_size = calc_io_size(); + int remaining = N - io_block_work_size() * blockIdx.x; + + if (remaining < io_block_work_size()) { // if this block handles the reminder, + // just do a naive unrolled loop + auto input_calc = TrivialOffsetCalculator(); + auto output_calc = TrivialOffsetCalculator<1>(); + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + auto policy = memory::policies::unroll< + array_t, + decltype(input_calc), + decltype(output_calc), + memory::LoadWithoutCast, + memory::StoreWithoutCast, + elems_per_thread()>( + data, remaining, input_calc, output_calc, loader, storer); + elementwise_kernel_helper(f, policy); + } else { // if this block has a full `block_work_size` data to handle, use + // vectorized memory access + elementwise_kernel_helper( + f, memory::policies::vectorized()>(data)); + } + } +} + +#else // USE_ROCM +template +C10_LAUNCH_BOUNDS_1(num_threads()) +__global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) { + using traits = function_traits; + constexpr auto io_size = calc_io_size(); +#if defined(USE_ROCM) && defined(__gfx942__) + // Similar check in launch_vectorized_kernel() as well. Both should be in sync. + constexpr int tws = 16; +#else + constexpr int tws = elems_per_thread(); +#endif + constexpr int bws = tws * num_threads(); + int remaining = N - bws * blockIdx.x; + + if (remaining < bws) { // if this block handles the reminder, + // just do a naive unrolled loop + auto input_calc = TrivialOffsetCalculator(); + auto output_calc = TrivialOffsetCalculator<1>(); + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + auto policy = memory::policies::unroll< + array_t, + decltype(input_calc), + decltype(output_calc), + memory::LoadWithoutCast, + memory::StoreWithoutCast, + tws>( + data, remaining, input_calc, output_calc, loader, storer); + elementwise_kernel_helper(f, policy); + } else { // if this block has a full `block_work_size` data to handle, use + // vectorized memory access + constexpr auto optimal_vec_size = calc_optimal_vec_size(); + elementwise_kernel_helper( + f, memory::policies::vectorized(data)); + } +} +#endif // USE_ROCM + +template < + typename func_t, + typename array_t, + int elems_per_thread, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + typename storer_t> +C10_LAUNCH_BOUNDS_1(num_threads()) +__global__ void unrolled_elementwise_kernel( + int N, + func_t f, + array_t data, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s) { + int remaining = N - elems_per_thread * num_threads() * blockIdx.x; + auto policy = memory::policies:: + unroll( + data, remaining, ic, oc, l, s); + elementwise_kernel_helper(f, policy); +} + +// this function assume trivial 1d and no dynamic casting +template +static inline void launch_vectorized_kernel( + int64_t N, + const func_t& f, + array_t data) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + using traits = function_traits; + constexpr auto io_size = calc_io_size(); + auto stream = at::cuda::getCurrentCUDAStream(); +#ifdef USE_ROCM + int vec_size = memory::can_vectorize_up_to(data); + c10::DeviceIndex curDevice = -1; + AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice)); + // Similar check in vectorized_elementwise_kernel() as well. Both should be in sync. + int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? 16 : elems_per_thread(); +#else + using cpp_type = typename function_traits::result_type; + const uint16_t max_vec_size = memory::can_vectorize_up_to(data); + uint16_t vec_size = 16 / static_cast(sizeof(cpp_type)); + vec_size = std::min(vec_size, max_vec_size); + // Here we purposely omit vec8 for 1-byte data because of a bug in NVCC + // that causes some numerical mismatches with uint8 on sm80 and sm90. + // TODO: Revisit this after CUDA 12.8 update. + cudaDeviceProp* p = at::cuda::getDeviceProperties(stream.device().index()); + const int computeCapability = p->major * 10 + p->minor; + if (computeCapability != 90 && computeCapability != 100) { + vec_size = std::min(vec_size, 4); + } + if constexpr (sizeof(cpp_type) < 2) { + vec_size = std::min(vec_size, 4); + } + int tws = elems_per_thread(); +#endif + int bws = tws * num_threads(); + int64_t grid = (N + bws - 1) / bws; + switch (vec_size) { +#ifdef USE_ROCM + case 16: + vectorized_elementwise_kernel<16, func_t, array_t> + <<>>(N, f, data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; +#endif + case 8: + vectorized_elementwise_kernel<8, func_t, array_t> + <<>>(N, f, data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + case 4: + vectorized_elementwise_kernel<4, func_t, array_t> + <<>>(N, f, data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + case 2: + vectorized_elementwise_kernel<2, func_t, array_t> + <<>>(N, f, data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + case 1: { + auto input_calc = TrivialOffsetCalculator(); + auto output_calc = TrivialOffsetCalculator<1>(); + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + int64_t grid_unrolled = (N + elementwise_block_work_size() - 1) / elementwise_block_work_size(); + unrolled_elementwise_kernel + <<>>( + N, f, data, input_calc, output_calc, loader, storer); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + } + default: + TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size"); + } +} + +#ifdef USE_ROCM +template < + int vec_size, + typename func_t, + typename array_t, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + typename storer_t, + typename OutputType, + typename... InputTypes> +C10_LAUNCH_BOUNDS_1(vectorized_templated_config::num_threads()) +__global__ void vectorized_templated_elementwise_kernel( + int N, + func_t f, + array_t data, + inp_calc_t inp_calc, + out_calc_t out_calc, + loader_t loader, + storer_t storer) { + int remaining = N - + vectorized_templated_config::block_work_size() * + (gridDim.x - blockIdx.x - 1); + constexpr bool reverted_idx = true; + + if (remaining < + vectorized_templated_config::block_work_size()) { // if this block handles + // the reminder, + // just do a naive unrolled loop + auto policy = memory::policies::unroll_base< + vectorized_templated_config::num_threads(), + array_t, + inp_calc_t, + out_calc_t, + loader_t, + storer_t, + vectorized_templated_config::elems_per_thread()>( + data, remaining, inp_calc, out_calc, loader, storer); + elementwise_kernel_helper(f, policy); + } else { // if this block has a full `block_work_size` data to handle, use + // vectorized memory access + auto policy = memory::policies::vectorized_templated< + vec_size, + array_t, + vectorized_templated_config::elems_per_thread(), + vectorized_templated_config::num_threads(), + OutputType, + InputTypes...>(data); + elementwise_kernel_helper(f, policy); + } +} + +// This function assume trivial 1d and supports template specialization +// to avoid dynamic casting. +// Input vectorization size is based on runtime information, i.e. +// the actual data types of the input and output tensor and cannot +// be determined using the functor type, as in regular non-templated +// vectorized kernels. The caller is in charge of selecting the correct input +// vectorization length. +template < + typename func_t, + typename array_t, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + typename storer_t, + typename OutputType, + typename... InputTypes> +static inline void launch_vectorized_templated_kernel( + int64_t N, + const func_t& f, + array_t data, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + int64_t grid = (N + vectorized_templated_config::block_work_size() - 1) / + vectorized_templated_config::block_work_size(); + auto stream = at::cuda::getCurrentCUDAStream(); + int vec_size = memory::can_vectorize_up_to(data); + switch (vec_size) { + case 8: + vectorized_templated_elementwise_kernel< + 8, + func_t, + array_t, + inp_calc_t, + out_calc_t, + loader_t, + storer_t, + OutputType, + InputTypes...> + <<>>( + N, f, data, ic, oc, l, s); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + case 4: + vectorized_templated_elementwise_kernel< + 4, + func_t, + array_t, + inp_calc_t, + out_calc_t, + loader_t, + storer_t, + OutputType, + InputTypes...> + <<>>( + N, f, data, ic, oc, l, s); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + case 2: + vectorized_templated_elementwise_kernel< + 2, + func_t, + array_t, + inp_calc_t, + out_calc_t, + loader_t, + storer_t, + OutputType, + InputTypes...> + <<>>( + N, f, data, ic, oc, l, s); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + default: + // vector size 1 is not handled as part of vectorize_templated kernel + TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size"); + } +} +#endif + +template < + typename func_t, + typename array_t, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + typename storer_t> +static inline void launch_unrolled_kernel( + int64_t N, + const func_t& f, + array_t data, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + + int64_t grid = (N + elementwise_block_work_size() - 1) / elementwise_block_work_size(); + auto stream = at::cuda::getCurrentCUDAStream(); + unrolled_elementwise_kernel + <<>>(N, f, data, ic, oc, l, s); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +C10_LAUNCH_BOUNDS_2(nt, 4) +__global__ void elementwise_kernel(int N, func_t f) { + int tid = threadIdx.x; + int nv = nt * vt; + int idx = nv * blockIdx.x + tid; +#pragma unroll + for (int i = 0; i < vt; i++) { + if (idx < N) { + f(idx); + idx += nt; + } + } +} + +template +static void launch_legacy_kernel(int64_t N, const func_t& f) { + TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits::max()); + if (N == 0) { + return; + } + dim3 block(nt); + dim3 grid((N + block.x * vt - 1) / (block.x * vt)); + auto stream = at::cuda::getCurrentCUDAStream(); + elementwise_kernel<<>>(N, f); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +#ifdef USE_ROCM +template +C10_LAUNCH_BOUNDS_2(nt, 4) +__global__ void elementwise_kernel_manual_unroll(int N, func_t f) { + int tid = threadIdx.x; + constexpr int nv = nt * vt; + int idx = nv * blockIdx.x + tid; + if ((idx + nt*(vt-1)) < N) { + f(idx, true); + } else { +#pragma unroll + for (int i = 0; i < vt; i++) { + if (idx < N) { + f(idx, false); + idx += nt; + } + } + } +} + +template +static void launch_legacy_kernel_manual_unroll(int64_t N, const func_t& f) { + TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits::max()); + if (N == 0) { + return; + } + dim3 block(nt); + dim3 grid((N + block.x * vt - 1) / (block.x * vt)); + auto stream = at::cuda::getCurrentCUDAStream(); + elementwise_kernel_manual_unroll<<>>(N, f); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} +#endif + +template +C10_HOST_DEVICE typename traits::result_type invoke_impl( + const func_t& f, + char* const C10_RESTRICT data[], + const index_t strides[], + int i, + std::index_sequence) { + (void)strides; + (void)i; + return f(c10::load::type>( + data[INDEX] + i * strides[INDEX])...); +} + +template < + typename func_t, + typename index_t, + typename traits = function_traits> +C10_HOST_DEVICE typename traits::result_type invoke( + const func_t& f, + char* const C10_RESTRICT data[], + const index_t strides[], + int i) { + using Indices = std::make_index_sequence; + return invoke_impl(f, data, strides, i, Indices{}); +} + +template +C10_HOST_DEVICE typename traits::result_type invoke_impl( + const func_t& f, + char* const C10_RESTRICT data[], + const index_t strides[], + const ScalarType dtypes[], + int i, + std::index_sequence) { + (void)strides; + (void)i; + return f(c10::fetch_and_cast::type>( + dtypes[I], data[I] + i * strides[I])...); +} + +template < + typename func_t, + typename index_t, + typename traits = function_traits> +C10_HOST_DEVICE typename traits::result_type invoke( + const func_t& f, + char* const C10_RESTRICT data[], + const index_t strides[], + const ScalarType dtypes[], + int i) { + using Indices = std::make_index_sequence; + return invoke_impl(f, data, strides, dtypes, i, Indices{}); +} + +template +void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) { + using traits = function_traits; + using arg0_t = typename traits::result_type; + constexpr int ntensors = traits::arity + 1; + + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + std::array data; + for (int i = 0; i < ntensors; i++) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + + bool contiguous = iter.is_contiguous(); + + if (contiguous) { + return launch_vectorized_kernel(numel, f, data); + } + auto offset_calc = ::make_offset_calculator(iter); +#ifndef USE_ROCM + constexpr int unroll_factor = sizeof(arg0_t) >= 4 ? 2 : 4; + launch_legacy_kernel<128, unroll_factor>(numel, [=] GPU_LAMBDA(int idx) { + auto offsets = offset_calc.get(idx); + arg0_t* out = (arg0_t*)(data[0] + offsets[0]); + *out = invoke(f, &data[1], &offsets[1], 1); + }); +#else + constexpr int unroll_factor = sizeof(arg0_t) >= 4 ? 4 : 8; + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + if constexpr (unroll_factor == 4) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx+grp_sz); + auto offsets2 = offset_calc.get(idx+grp_sz*2); + auto offsets3 = offset_calc.get(idx+grp_sz*3); + arg0_t* out0 = (arg0_t*)(data[0] + offsets0[0]); + arg0_t* out1 = (arg0_t*)(data[0] + offsets1[0]); + arg0_t* out2 = (arg0_t*)(data[0] + offsets2[0]); + arg0_t* out3 = (arg0_t*)(data[0] + offsets3[0]); + auto tmp0 = invoke(f, &data[1], &offsets0[1], 1); + auto tmp1 = invoke(f, &data[1], &offsets1[1], 1); + auto tmp2 = invoke(f, &data[1], &offsets2[1], 1); + auto tmp3 = invoke(f, &data[1], &offsets3[1], 1); + *out0 = tmp0; + *out1 = tmp1; + *out2 = tmp2; + *out3 = tmp3; + } else { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx+grp_sz); + auto offsets2 = offset_calc.get(idx+grp_sz*2); + auto offsets3 = offset_calc.get(idx+grp_sz*3); + auto offsets4 = offset_calc.get(idx+grp_sz*4); + auto offsets5 = offset_calc.get(idx+grp_sz*5); + auto offsets6 = offset_calc.get(idx+grp_sz*6); + auto offsets7 = offset_calc.get(idx+grp_sz*7); + arg0_t* out0 = (arg0_t*)(data[0] + offsets0[0]); + arg0_t* out1 = (arg0_t*)(data[0] + offsets1[0]); + arg0_t* out2 = (arg0_t*)(data[0] + offsets2[0]); + arg0_t* out3 = (arg0_t*)(data[0] + offsets3[0]); + arg0_t* out4 = (arg0_t*)(data[0] + offsets4[0]); + arg0_t* out5 = (arg0_t*)(data[0] + offsets5[0]); + arg0_t* out6 = (arg0_t*)(data[0] + offsets6[0]); + arg0_t* out7 = (arg0_t*)(data[0] + offsets7[0]); + auto tmp0 = invoke(f, &data[1], &offsets0[1], 1); + auto tmp1 = invoke(f, &data[1], &offsets1[1], 1); + auto tmp2 = invoke(f, &data[1], &offsets2[1], 1); + auto tmp3 = invoke(f, &data[1], &offsets3[1], 1); + auto tmp4 = invoke(f, &data[1], &offsets4[1], 1); + auto tmp5 = invoke(f, &data[1], &offsets5[1], 1); + auto tmp6 = invoke(f, &data[1], &offsets6[1], 1); + auto tmp7 = invoke(f, &data[1], &offsets7[1], 1); + *out0 = tmp0; + *out1 = tmp1; + *out2 = tmp2; + *out3 = tmp3; + *out4 = tmp4; + *out5 = tmp5; + *out6 = tmp6; + *out7 = tmp7; + } + } else { + auto offsets = offset_calc.get(idx); + arg0_t* out = (arg0_t*)(data[0] + offsets[0]); + *out = invoke(f, &data[1], &offsets[1], 1); + } + }); +#endif +} + +#ifdef USE_ROCM +namespace { +template < + typename TupleLike, + typename FirstParamTy, + typename SecondParamTy, + size_t arity, + size_t arg_num = 0> +struct check_binary_functor_types_for_specialization { + constexpr static inline bool check() { + if constexpr (arity != 2) + return false; + if constexpr (arg_num == 0) { + using SelectedType = std::tuple_element_t; + if constexpr (std::is_same_v) + return check_binary_functor_types_for_specialization< + TupleLike, + FirstParamTy, + SecondParamTy, + arity, + arg_num + 1>::check(); + } else if constexpr (arg_num == 1) { + using SelectedType2 = std::tuple_element_t; + if constexpr (std::is_same_v) + return check_binary_functor_types_for_specialization< + TupleLike, + FirstParamTy, + SecondParamTy, + arity, + arg_num + 1>::check(); + } + return false; + } +}; + +// Bottom case: if we got this far, assume correct type matching except +// when there are no arguments (arity == 0). +template < + typename TupleLike, + typename FirstParamTy, + typename SecondParamTy, + size_t arity> +struct check_binary_functor_types_for_specialization< + TupleLike, + FirstParamTy, + SecondParamTy, + arity, + arity> { + constexpr static inline bool check() { + if constexpr (arity != 0) + return true; + return false; + } +}; + +template +struct check_binary_functor_types_for_specialization< + TupleLike, + FirstParamTy, + SecondParamTy, + 0, + 0> { + constexpr static inline bool check() { + return false; + } +}; + +// The following is a list of type specializations for vectorized_templated +// elementwise kernel. The three types refer to runtime types of the output +// tensor, first tensor argument, and the second tensor argument used for a +// binary functor. +constexpr std::array rt_binary_specializations = { + std::array( + {c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value}), + std::array( + {c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value}), + std::array( + {c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value}), + std::array( + {c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value}), + std::array( + {c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value}), + std::array( + {c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value, + c10::CppTypeToScalarType::value})}; + +bool check_binary_rt_types_for_specialization(TensorIteratorBase& iter) { + if (iter.ninputs() != 2) + return false; + for (auto spec : rt_binary_specializations) + if (iter.dtype(0) == spec[0] && iter.input_dtype(0) == spec[1] && + iter.input_dtype(1) == spec[2]) + return true; + return false; +} + +template +struct type_specialized_kernel_launcher { + template < + typename func_t, + typename array_t, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + typename storer_t> + static void apply( + ScalarType ret_t, + ScalarType arg0_t, + ScalarType arg1_t, + int64_t numel, + func_t f, + array_t data, + inp_calc_t input_offset_calculator, + out_calc_t output_offset_calculator, + loader_t loader, + storer_t storer) { + constexpr ScalarType sret_t = rt_binary_specializations[arg_index][0]; + constexpr ScalarType sarg0_t = rt_binary_specializations[arg_index][1]; + constexpr ScalarType sarg1_t = rt_binary_specializations[arg_index][2]; + if (ret_t == sret_t && arg0_t == sarg0_t && arg1_t == sarg1_t) { + using cret_t = c10::impl::ScalarTypeToCPPTypeT; + using carg0_t = c10::impl::ScalarTypeToCPPTypeT; + using carg1_t = c10::impl::ScalarTypeToCPPTypeT; + launch_vectorized_templated_kernel< + func_t, + array_t, + inp_calc_t, + out_calc_t, + loader_t, + storer_t, + cret_t, + carg0_t, + carg1_t>( + numel, + f, + data, + input_offset_calculator, + output_offset_calculator, + loader, + storer); + } + } +}; + +template +struct type_specialized_broadcast_kernel_launcher { + template < + typename func_t, + typename array_t, + typename dtypes_t, + typename calc_t> + static void apply( + int64_t numel, + func_t f, + array_t data, + dtypes_t dtypes, + calc_t offset_calc) { + using traits = function_traits; + using ret_t = typename traits::result_type; + using arg0_t = typename traits::template arg<0>::type; + using arg1_t = typename traits::template arg<1>::type; + if (dtypes[0] == rt_binary_specializations[arg_index][0] && + dtypes[1] == rt_binary_specializations[arg_index][1] && + dtypes[2] == rt_binary_specializations[arg_index][2]) { + using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT; + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + auto u = c10::load(data[1] + offsets0[1]); + auto v = c10::load(data[2] + offsets0[2]); + ret_t result0 = f(c10::convert(u), c10::convert(v)); + auto u1 = c10::load(data[1] + offsets1[1]); + auto v1 = c10::load(data[2]+ offsets1[2]); + ret_t result1 = f(c10::convert(u1), c10::convert(v1)); + auto u2 = c10::load(data[1] + offsets2[1]); + auto v2 = c10::load(data[2] + offsets2[2]); + ret_t result2 = f(c10::convert(u2), c10::convert(v2)); + auto u3 = c10::load(data[1] + offsets3[1]); + auto v3 = c10::load(data[2] + offsets3[2]); + ret_t result3 = f(c10::convert(u3), c10::convert(v3)); + *(ret_cpp_t*)out0 = c10::convert(result0); + *(ret_cpp_t*)out1 = c10::convert(result1); + *(ret_cpp_t*)out2 = c10::convert(result2); + *(ret_cpp_t*)out3 = c10::convert(result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + auto u = c10::load(data[1] + offsets[1]); + auto v = c10::load(data[2] + offsets[2]); + ret_t result = f(c10::convert(u), c10::convert(v)); + *(ret_cpp_t*)out = c10::convert(result); + } + }); + } + } +}; + +} // namespace +#endif + +template +void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { + if (!needs_dynamic_casting::check(iter)) { + return gpu_kernel_impl_nocast(iter, f); + } + using traits = function_traits; + using arg0_t = typename traits::result_type; + constexpr int ntensors = traits::arity + 1; + + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + + std::array data; + for (int i = 0; i < ntensors; i++) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + + bool contiguous = iter.is_contiguous(); + + if (contiguous) { +#ifdef USE_ROCM + // Attempt to call specialized vectorized elementwise kernel + // that enables interleaving. + if (check_binary_rt_types_for_specialization(iter) && + memory::can_vectorize_up_to(data) > 1) { + // constexpr to reduce the amount of kernels generated for + // vectorized templated elementwise and limit which functors are actually + // applied to the load and store at compile time. + using func_tuple = typename traits::ArgsTuple; + if constexpr ( + std::is_same_v && traits::arity == 2 && + check_binary_functor_types_for_specialization< + func_tuple, + float, + float, + traits::arity, + /*arg_num=*/0>::check()) { + // If we got here, we know we are in one of the specialized cases. We + // need to translate the runtime type to a statically known type. This + // is effectively hoisting to the host the switch over runtime type in + // the kernel in fetch_and_cast. Loader, storer, offset calculators are + // only needed for the reminder loop. + auto input_offset_calculator = TrivialOffsetCalculator(); + auto output_offset_calculator = TrivialOffsetCalculator<1>(); + auto loader = memory::LoadWithCast(iter); + auto storer = memory::StoreWithCast<1>(iter); + memory::detail::static_unroll< + type_specialized_kernel_launcher, + rt_binary_specializations.size()>:: + with_args( + iter.dtype(0), + iter.input_dtype(0), + iter.input_dtype(1), + numel, + f, + data, + input_offset_calculator, + output_offset_calculator, + loader, + storer); + return; + } + } + std::array dtypes; + auto inner_strides = iter.get_inner_strides(); + std::array strides; + for (int i = 0; i < ntensors; i++) { + dtypes[i] = iter.dtype(i); + strides[i] = inner_strides[i]; + } + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + void* out0 = data[0] + strides[0] * idx; + void* out1 = data[0] + strides[0] * (idx + grp_sz); + void* out2 = data[0] + strides[0] * (idx + grp_sz * 2); + void* out3 = data[0] + strides[0] * (idx + grp_sz * 3); + arg0_t result0 = invoke(f, &data[1], &strides[1], &dtypes[1], idx); + arg0_t result1 = invoke(f, &data[1], &strides[1], &dtypes[1], (idx + grp_sz)); + arg0_t result2 = invoke(f, &data[1], &strides[1], &dtypes[1], (idx + grp_sz * 2)); + arg0_t result3 = invoke(f, &data[1], &strides[1], &dtypes[1], (idx + grp_sz * 3)); + c10::cast_and_store(dtypes[0], out0, result0); + c10::cast_and_store(dtypes[0], out1, result1); + c10::cast_and_store(dtypes[0], out2, result2); + c10::cast_and_store(dtypes[0], out3, result3); + } else { + void* out = data[0] + strides[0] * idx; + arg0_t result = invoke(f, &data[1], &strides[1], &dtypes[1], idx); + c10::cast_and_store(dtypes[0], out, result); + } + }); +#else + auto loader = memory::LoadWithCast(iter); + auto storer = memory::StoreWithCast<1>(iter); + auto input_offset_calculator = TrivialOffsetCalculator(); + auto output_offset_calculator = TrivialOffsetCalculator<1>(); + launch_unrolled_kernel( + numel, + f, + data, + input_offset_calculator, + output_offset_calculator, + loader, + storer); +#endif + } else { + std::array dtypes; + for (int i = 0; i < ntensors; i++) { + dtypes[i] = iter.dtype(i); + } + auto offset_calc = ::make_offset_calculator(iter); +#ifdef USE_ROCM + if (check_binary_rt_types_for_specialization(iter)) { + // constexpr to reduce the amount of kernels generated for + // broadcast elementwise with mexed dtypes and limit which functors are actually + // applied to the load and store at compile time. + using func_tuple = typename traits::ArgsTuple; + if constexpr ( + std::is_same_v && traits::arity == 2 && + check_binary_functor_types_for_specialization< + func_tuple, + float, + float, + traits::arity, + /*arg_num=*/0>::check()) { + memory::detail::static_unroll< + type_specialized_broadcast_kernel_launcher, + rt_binary_specializations.size()>::with_args( + numel, + f, + data, + dtypes, + offset_calc + ); + return; + } + } + + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1); + arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1); + arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1); + arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out0, result0); + c10::cast_and_store(dtypes[0], out1, result1); + c10::cast_and_store(dtypes[0], out2, result2); + c10::cast_and_store(dtypes[0], out3, result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out, result); + } + }); +#else + launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out, result); + }); +#endif + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h new file mode 100644 index 0000000000000000000000000000000000000000..e149b2d0d0da96e7450baf4429331cf764e8629a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at { namespace native { + +struct TupleInfoCPU { + template + using tuple = thrust::tuple; + + template + static constexpr auto tie(Types&... args) noexcept { + return thrust::tie(args...); + } +}; + +template +using CompositeRandomAccessorCPU = + CompositeRandomAccessor; + +template +void swap( + references_holder rh1, + references_holder rh2 +) { + return thrust::swap(rh1.data(), rh2.data()); +} + +template +auto get(references_holder rh) -> decltype(thrust::get(rh.data())) { + return thrust::get(rh.data()); +} + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Copy.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..c0ed130b8dfabd55d9c3ca47e98b21271bd2a888 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Copy.h @@ -0,0 +1,16 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at { +struct TensorIteratorBase; + +namespace native { + +void direct_copy_kernel_cuda(TensorIteratorBase& iter); + +} +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h new file mode 100644 index 0000000000000000000000000000000000000000..cfdf448927baab4a590902f06867a79cc18d52ba --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h @@ -0,0 +1,499 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at::native::detail { + +// Enum representing the FFT type +enum class CuFFTTransformType : int8_t { + C2C, // Complex-to-complex + R2C, // Real-to-complex + C2R, // Complex-to-real +}; + +// This struct is used to let us easily compute hashes of the +// parameters. +// It will be the **key** to the plan cache. +struct CuFFTParams +{ + int64_t signal_ndim_; // between 1 and max_rank, i.e., 1 <= signal_ndim <= 3 + // These include additional batch dimension as well. + int64_t sizes_[max_rank + 1]; + int64_t input_strides_[max_rank + 1]; + int64_t output_strides_[max_rank + 1]; + CuFFTTransformType fft_type_; + ScalarType value_type_; + + CuFFTParams() = default; + + CuFFTParams(IntArrayRef in_strides, IntArrayRef out_strides, + IntArrayRef signal_sizes, CuFFTTransformType fft_type, ScalarType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_sizes.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + + TORCH_INTERNAL_ASSERT(in_strides.size() == signal_sizes.size()); + TORCH_INTERNAL_ASSERT(out_strides.size() == signal_sizes.size()); + TORCH_INTERNAL_ASSERT(1 <= signal_ndim_ && signal_ndim_ <= max_rank); + + std::copy(signal_sizes.cbegin(), signal_sizes.cend(), sizes_); + std::copy(in_strides.cbegin(), in_strides.cend(), input_strides_); + std::copy(out_strides.cbegin(), out_strides.cend(), output_strides_); + } +}; + +static_assert(std::is_trivial_v ); + +// Returns true if the transform type has complex input +inline bool cufft_complex_input(CuFFTTransformType type) { + switch (type) { + case CuFFTTransformType::C2C: + case CuFFTTransformType::C2R: + return true; + + case CuFFTTransformType::R2C: + return false; + } + TORCH_INTERNAL_ASSERT(false); +} + +// Returns true if the transform type has complex output +inline bool cufft_complex_output(CuFFTTransformType type) { + switch (type) { + case CuFFTTransformType::C2C: + case CuFFTTransformType::R2C: + return true; + + case CuFFTTransformType::C2R: + return false; + } + TORCH_INTERNAL_ASSERT(false); +} + +// Create transform type enum from bools representing if input and output are complex +inline CuFFTTransformType GetCuFFTTransformType(bool complex_input, bool complex_output) { + if (complex_input && complex_output) { + return CuFFTTransformType::C2C; + } else if (complex_input && !complex_output) { + return CuFFTTransformType::C2R; + } else if (!complex_input && complex_output) { + return CuFFTTransformType::R2C; + } + TORCH_INTERNAL_ASSERT(false, "Real to real FFTs are not supported"); +} + + +class CuFFTHandle { + ::cufftHandle handle_; +public: + + CuFFTHandle() { + CUFFT_CHECK(cufftCreate(&handle_)); + } + + ::cufftHandle & get() { return handle_; } + const ::cufftHandle & get() const { return handle_; } + + ~CuFFTHandle() { +// Not using fftDestroy() for rocFFT to work around double freeing of handles +#if !defined(USE_ROCM) + cufftDestroy(handle_); +#endif + } +}; + +__forceinline__ +static bool is_pow_of_two(int64_t x) { + return (x & (x - 1)) == 0; +} + +using cufft_size_type = long long int; + +using CuFFTDimVector = c10::SmallVector; + +// Struct representing a tensor in CuFFT's data layout for planning transforms +// See NOTE [ cuFFT Embedded Strides ]. +struct CuFFTDataLayout { + CuFFTDimVector embed; + cufft_size_type stride, dist; + bool must_clone, simple; +}; + +// Returns a cufft embedding for a contiguous signal of the given size. +// e.g. if the input is cloned, this will be the resulting data layout +// See NOTE [ cuFFT Embedded Strides ]. +inline CuFFTDataLayout cufft_simple_embed(IntArrayRef sizes, bool onesided) { + CuFFTDataLayout layout; + layout.simple = true; + layout.must_clone = false; + layout.embed.assign(sizes.cbegin() + 1, sizes.cend()); + if (onesided) { + layout.embed.back() = sizes.back() / 2 + 1; + } + layout.stride = 1; + layout.dist = 1; + for (const auto& len : layout.embed) { + layout.dist *= len; + } + return layout; +} + +// Convert strides to a CuFFT embedded representation. +// If strides cannot be embedded, returns a simple layout and sets must_clone flag +// See NOTE [ cuFFT Embedded Strides ]. +inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bool onesided) { + const auto signal_ndim = strides.size() - 1; + CuFFTDataLayout layout; + auto last_stride = strides[signal_ndim]; + layout.must_clone = (last_stride <= 0); + + const auto last_dim_size = onesided ? + sizes[signal_ndim] / 2 + 1 : sizes[signal_ndim]; + const auto signal_numel = c10::multiply_integers(sizes.slice(1, sizes.size() - 2)) * last_dim_size; + + // Zero stides are not allowed, even if the batch size is one. + // If that happens just set a dummy case + if (sizes[0] == 1) { + layout.dist = signal_numel; + } else if (strides[0] == 0) { + layout.must_clone = true; + } else { + layout.dist = strides[0]; + } + + // Calculate the embedding shape, or set must_clone if the strides cannot be embedded + layout.embed.resize(signal_ndim); + for (auto i = signal_ndim - 1; !layout.must_clone && i > 0; i--) { + auto stride = strides[i]; + if (sizes[i] == 1) { + layout.embed[i] = 1; + } else if (stride > 0 && stride % last_stride == 0) { + layout.embed[i] = stride / last_stride; + last_stride = stride; + } else { + layout.must_clone = true; + } + } + + if (layout.must_clone) { + // If the input needs to be cloned, assume it will be contiguous + layout = cufft_simple_embed(sizes, onesided); + layout.must_clone = true; + } else { + layout.embed[0] = sizes[1]; + layout.stride = strides[signal_ndim]; + // Determine if layout represents a simple embedding (contiguous data) + layout.simple = [&] { + for (const auto i : c10::irange(1, signal_ndim - 1)) { + if (layout.embed[i] != sizes[i + 1]) { + return false; + } + } + + return (layout.stride == 1 && layout.dist == signal_numel && + layout.embed.back() == last_dim_size); + }(); + } + return layout; +} + +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. whether to clone input before executing the plan +// 3. the workspace size needed +// +// This class will be the **value** in the plan cache. +// It **owns** the raw plan via a unique_ptr. +class CuFFTConfig { +public: + + // Only move semantics is enough for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. + CuFFTConfig(const CuFFTConfig&) = delete; + CuFFTConfig& operator=(CuFFTConfig const&) = delete; + + explicit CuFFTConfig(const CuFFTParams& params): + CuFFTConfig( + IntArrayRef(params.input_strides_, params.signal_ndim_ + 1), + IntArrayRef(params.output_strides_, params.signal_ndim_ + 1), + IntArrayRef(params.sizes_, params.signal_ndim_ + 1), + params.fft_type_, + params.value_type_) {} + + // For complex types, strides are in units of 2 * element_size(dtype) + // sizes are for the full signal, including batch size and always two-sided + CuFFTConfig(IntArrayRef in_strides, IntArrayRef out_strides, + IntArrayRef sizes, CuFFTTransformType fft_type, ScalarType dtype): + fft_type_(fft_type), value_type_(dtype) { + + // signal sizes (excluding batch dim) + CuFFTDimVector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const int64_t batch = sizes[0]; + const int64_t signal_ndim = sizes.size() - 1; + + // Since cuFFT has limited non-unit stride support and various constraints, we + // use a flag to keep track throughout this function to see if we need to + // input = input.clone(); + +#if defined(USE_ROCM) + // clone input to avoid issues with hipfft clobering the input and failing tests + clone_input = true; +#else + clone_input = false; +#endif + + // For half, base strides on the real part of real-to-complex and + // complex-to-real transforms are not supported. Since our output is always + // contiguous, only need to check real-to-complex case. + if (dtype == ScalarType::Half) { + // cuFFT on half requires compute capability of at least SM_53 + auto dev_prop = at::cuda::getCurrentDeviceProperties(); + TORCH_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), + "cuFFT doesn't support signals of half type with compute " + "capability less than SM_53, but the device containing input half " + "tensor only has SM_", dev_prop->major, dev_prop->minor); + for (const auto i : c10::irange(signal_ndim)) { + TORCH_CHECK(is_pow_of_two(sizes[i + 1]), + "cuFFT only supports dimensions whose sizes are powers of two when" + " computing in half precision, but got a signal size of", + sizes.slice(1)); + } + clone_input |= in_strides.back() != 1; + } + + CuFFTDataLayout in_layout; + if (clone_input) { + in_layout = cufft_simple_embed(sizes, fft_type == CuFFTTransformType::C2R); + } else { + in_layout = as_cufft_embed(in_strides, sizes, fft_type == CuFFTTransformType::C2R); + } + auto out_layout = as_cufft_embed(out_strides, sizes, fft_type == CuFFTTransformType::R2C); + TORCH_INTERNAL_ASSERT(!out_layout.must_clone, "Out strides cannot be represented as CuFFT embedding"); + clone_input |= in_layout.must_clone; + + // Check if we can take advantage of simple data layout. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + + const bool simple_layout = in_layout.simple && out_layout.simple; + cudaDataType itype, otype, exec_type; + const auto complex_input = cufft_complex_input(fft_type); + const auto complex_output = cufft_complex_output(fft_type); + if (dtype == ScalarType::Float) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (dtype == ScalarType::Double) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (dtype == ScalarType::Half) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + TORCH_CHECK(false, "cuFFT doesn't support tensor of type: ", dtype); + } + + // disable auto allocation of workspace to use THC allocator + CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + // make plan + if (simple_layout) { + // If with unit-stride, we tell cuFFT by setting inembed == onembed == NULL. + // In such case, cuFFT ignores istride, ostride, idist, and odist + // by assuming istride = ostride = 1. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + } else { + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + in_layout.embed.data(), in_layout.stride, in_layout.dist, itype, + out_layout.embed.data(), out_layout.stride, out_layout.dist, otype, + batch, &ws_size_t, exec_type)); + } + ws_size = static_cast(ws_size_t); + } + + const cufftHandle &plan() const { return plan_ptr.get(); } + + CuFFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + bool should_clone_input() const { return clone_input; } + int64_t workspace_size() const { return ws_size; } + +private: + CuFFTHandle plan_ptr; + bool clone_input; + int64_t ws_size; + CuFFTTransformType fft_type_; + ScalarType value_type_; +}; + +#if defined(USE_ROCM) + // Note that the max plan number for CUDA version < 10 has to be 1023 + // due to a bug that fails on the 1024th plan + constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023; + constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; +#else + constexpr int64_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); + // The default max cache size chosen for CUDA version > 10 is arbitrary. + // This number puts a limit on how big of a plan cache should we maintain by + // default. Users can always configure it via cufft_set_plan_cache_max_size. + constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = 4096; +#endif +static_assert(0 <= CUFFT_MAX_PLAN_NUM && CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); +static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, + "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); + +// This cache assumes that the mapping from key to value never changes. +// This is **NOT** thread-safe. Please use a mutex when using it **AND** the +// value returned from try_emplace_value. +// The contract of using this cache is that try_emplace_value should only be +// used when the max_size is positive. +class CuFFTParamsLRUCache { +public: + using kv_t = typename std::pair; + using map_t = typename std::unordered_map, + typename std::list::iterator, + ParamsHash, + ParamsEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + + CuFFTParamsLRUCache() : CuFFTParamsLRUCache(CUFFT_DEFAULT_CACHE_SIZE) {} + + CuFFTParamsLRUCache(int64_t max_size) { + _set_max_size(max_size); + } + + CuFFTParamsLRUCache(CuFFTParamsLRUCache&& other) noexcept : + _usage_list(std::move(other._usage_list)), + _cache_map(std::move(other._cache_map)), + _max_size(other._max_size) {} + + CuFFTParamsLRUCache& operator=(CuFFTParamsLRUCache&& other) noexcept { + _usage_list = std::move(other._usage_list); + _cache_map = std::move(other._cache_map); + _max_size = other._max_size; + return *this; + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache and return it. + // Return const reference because CuFFTConfig shouldn't be tampered with once + // created. + const CuFFTConfig &lookup(CuFFTParams params) { + AT_ASSERT(_max_size > 0); + + map_kkv_iter_t map_it = _cache_map.find(params); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(params), + std::forward_as_tuple(params)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + + std::mutex mutex; + +private: + // Only sets size and does value check. Does not resize the data structures. + void _set_max_size(int64_t new_size) { + // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since + // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check + // first. + TORCH_CHECK(new_size >= 0, + "cuFFT plan cache size must be non-negative, but got ", new_size); + TORCH_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, + "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +// Since ATen is separated into CPU build and CUDA build, we need a way to call +// these functions only when CUDA is loaded. We use CUDA hooks for this purpose +// (at cuda/detail/CUDAHooks.cpp), and call the hooked functions from the actual +// native function counterparts (at native/SpectralOps.cpp), i.e., +// _cufft_get_plan_cache_max_size, _cufft_set_plan_cache_max_size +// _cufft_get_plan_cache_size, and _cufft_clear_plan_cache. +int64_t cufft_get_plan_cache_max_size_impl(DeviceIndex device_index); +void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_size); +int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index); +void cufft_clear_plan_cache_impl(DeviceIndex device_index); + +} // namespace at::native::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..b93f3195de0c88f9c118b3606976530b0e3c6794 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTUtils.h @@ -0,0 +1,80 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace at { namespace native { + +// This means that max dim is 3 + 2 = 5 with batch dimension and possible +// complex dimension +constexpr int max_rank = 3; + +static inline std::string _cudaGetErrorEnum(cufftResult error) +{ + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; +#if CUDA_VERSION <= 12090 + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; +#endif +#if !defined(USE_ROCM) && CUDA_VERSION <= 12090 + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; +#endif + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + default: + std::ostringstream ss; + ss << "unknown error " << error; + return ss.str(); + } +} + +static inline void CUFFT_CHECK(cufftResult error) +{ + if (error != CUFFT_SUCCESS) { + std::ostringstream ss; + ss << "cuFFT error: " << _cudaGetErrorEnum(error); + TORCH_CHECK(false, ss.str()); + } +} + +}} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh new file mode 100644 index 0000000000000000000000000000000000000000..396cd1e05b434a6bf50b1e559a644fb66403cc0d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at::native { +#if defined(USE_ROCM) +// take these out when ROCm implements std:: math functions +#include +template +static __forceinline__ __device__ scalar_t device_sqrt(scalar_t val); + +template <> +__forceinline__ __device__ float device_sqrt(float val) { + return ::sqrtf(val); +} + +template <> +__forceinline__ __device__ double device_sqrt(double val) { + return ::sqrt(val); +} +#else +template +__forceinline__ __device__ double device_sqrt(scalar_t val) { + return std::sqrt(val); +} +#endif +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h new file mode 100644 index 0000000000000000000000000000000000000000..618af6054a30989438f4d265e4ffc587420fca08 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h @@ -0,0 +1,702 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { + +// launch bounds used for kernels utilizing TensorIterator +const uint32_t block_size_bound = 256; +const uint32_t grid_size_bound = 4; +// At the time of writing, there is no curand_* call that increments the offset by more than 4. +// See: https://docs.nvidia.com/cuda/archive/11.8.0/curand/group__DEVICE.html +const uint32_t max_generator_offsets_per_curand_call = 4; + +// utility function that calculates proper philox_offset +// for distributions utilizing TensorIterator. For distributions using +// TensorIterator, we are using a grid-stride loop with each +// thread yielding one element per thread. For the edge of the grid-stride +// loop, if the tensor size is large, the unroll loop will kick in and the float4 +// from curand4 will start getting utilized (for common tensor sizes, we end up +// using rand.x from each thread). The philox_offset calculation was changed to +// (number of elements per thread * maximum generator increment per "curand_*" call), which makes +// sure that philox offset increment is not less than the number of randoms used +// in each thread. +std::tuple calc_execution_policy(const int64_t total_elements, const uint32_t unroll_factor) { + const uint64_t numel = static_cast(total_elements); + const uint32_t block_size = block_size_bound; + dim3 dim_block(block_size); + dim3 grid((numel + block_size - 1) / block_size); + uint32_t blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size; + grid.x = std::min( + static_cast(at::cuda::getCurrentDeviceProperties()->multiProcessorCount) * blocks_per_sm, + grid.x); + //number of times random will be generated per thread, to offset philox counter in thc random state + uint64_t counter_offset = ((numel - 1) / (block_size * grid.x * unroll_factor) + 1) * max_generator_offsets_per_curand_call; + return std::make_tuple(counter_offset, grid, dim_block); +} + +// grid stride loop kernel for distributions +template +C10_LAUNCH_BOUNDS_2(block_size_bound, grid_size_bound) +__global__ void distribution_elementwise_grid_stride_kernel(int64_t numel, + PhiloxCudaState philox_args, + const dist_t dist_func, + const transform_t transform_func) { + auto [seed, offset] = at::cuda::philox::unpack(philox_args); + int64_t idx = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x; + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, offset, &state); + + int64_t rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) * + blockDim.x * gridDim.x * unroll_factor; + for(int64_t linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) { + auto rand = dist_func(&state); + #pragma unroll + for (int ii = 0; ii < unroll_factor; ii++) { + int64_t li = linear_index + blockDim.x * gridDim.x * ii; + if (li < numel) { + transform_func(li, static_cast((&rand.x)[ii])); + } + } + __syncthreads(); + } +} + +/** + * distribution_nullary_kernel is analogous to gpu_kernel in + * ATen/native/cuda/Loops.cuh. Like gpu_kernel, it uses + * TensorIterator to launch a kernel. However, the differences are + * - it launches a grid-stride loop based kernel. The kernel is not + * generic like elementwise_kernel in Loops.cuh and is specialized + * for the distribution kernels here. + * - For big size tensors, we can launch multiple kernels recursively + * (i.e. if (!iter.can_use_32bit_indexing())) and hence, the philox + * offset calculation is done in this function. + * + * FIXME: Can we specialize elementwise_kernel and launch_kernel in Loops.cuh + * to have grid-stride loop kernel and then use that to launch our distribution + * kernels? Note that we need a grid-stride loop kernel because, we found by testing + * that it achieves peak effective bandwidth. + */ +template +void distribution_nullary_kernel(at::TensorIteratorBase& iter, + RNG gen, + const dist_t& dist_func, + const transform_t transform_func) { + const int unroll_factor = sizeof(dist_func_return_t) / sizeof(accscalar_t); + TORCH_CHECK(unroll_factor >= 1, "unroll_factor must be >= 1."); + int64_t numel = iter.numel(); + if (numel == 0) { + return; + } + + auto [counter_offset, grid, block] = calc_execution_policy(numel, unroll_factor); + PhiloxCudaState rng_engine_inputs; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_cuda_state(counter_offset); + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + distribution_nullary_kernel(sub_iter, + gen, dist_func, transform_func); + } + return; + } + + char* out_data = (char*)iter.data_ptr(0); + + auto stream = at::cuda::getCurrentCUDAStream(); + if (iter.is_trivial_1d()) { + auto strides = iter.get_inner_strides(); + int stride0 = strides[0]; + distribution_elementwise_grid_stride_kernel<<>>( + numel, + rng_engine_inputs, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + scalar_t* out = (scalar_t*)&out_data[stride0 * idx]; + *out = transform_func(rand); + } + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + auto offset_calc = make_offset_calculator<1>(iter); + distribution_elementwise_grid_stride_kernel<<>>( + numel, + rng_engine_inputs, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + auto offsets = offset_calc.get(idx); + scalar_t* out = (scalar_t*)&out_data[offsets[0]]; + *out = transform_func(rand); + } + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +// Binary kernel +template +__global__ void distribution_binary_elementwise_kernel( + int numel, + func_t f, + PhiloxCudaState philox_args, + typename function_traits::result_type *output_data, + const typename function_traits::template arg<1>::type *input_data_1, + const typename function_traits::template arg<2>::type *input_data_2, + inp_offset_calc_t inp_calc, + out_offset_calc_t out_calc) { + auto seeds = at::cuda::philox::unpack(philox_args); + + using input_t_1 = typename function_traits::template arg<1>::type; + using input_t_2 = typename function_traits::template arg<2>::type; + + input_t_1 inputs_1[thread_work_size()]; + input_t_2 inputs_2[thread_work_size()]; + + int base_index = block_work_size() * blockIdx.x; + int remaining = std::min(numel - base_index, block_work_size()); + + curandStatePhilox4_32_10_t state; + curand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); + + // load data into registers + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + break; + } + int input_idx = thread_idx + base_index; + auto offsets = inp_calc.get(input_idx); + inputs_1[i] = input_data_1[offsets[0]]; + inputs_2[i] = input_data_2[offsets[1]]; + + thread_idx += num_threads(); + } + + // compute and store + thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + break; + } + int input_idx = thread_idx + base_index; + auto offsets = out_calc.get(input_idx); + output_data[offsets[0]] = f(state, inputs_1[i], inputs_2[i]); + thread_idx += num_threads(); + } +} + +template +void distribution_binary_kernel(TensorIteratorBase &iter, PhiloxCudaState philox_args, const func_t &f) { + static_assert(std::is_same_v::template arg<0>::type, curandStatePhilox4_32_10_t&>, "the first argument of functor must be curandStatePhilox4_32_10_t"); + using input_t_1 = typename function_traits::template arg<1>::type; + using input_t_2 = typename function_traits::template arg<2>::type; + using output_t = typename function_traits::result_type; + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + distribution_binary_kernel(sub_iter, philox_args, f); + } + return; + } + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(iter.can_use_32bit_indexing()); + + int64_t numel = iter.numel(); + if (numel == 0) { + return; + } + + output_t *output_data = static_cast(iter.data_ptr(0)); + const input_t_1 *input_data_1 = static_cast(iter.data_ptr(1)); + const input_t_2 *input_data_2 = static_cast(iter.data_ptr(2)); + + int64_t grid = (numel + block_work_size() - 1) / block_work_size(); + auto stream = at::cuda::getCurrentCUDAStream(); + + if (iter.is_contiguous()) { + distribution_binary_elementwise_kernel<<>>( + numel, f, philox_args, output_data, input_data_1, input_data_2, + TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + distribution_binary_elementwise_kernel<<>>( + numel, f, philox_args, output_data, input_data_1, input_data_2, + make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +} // namespace +}} // namespace at::native + + +namespace at { +namespace native { +namespace templates { +namespace cuda { + +// ==================================================== Random ======================================================== + +template +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG gen) { +#ifdef FBCODE_CAFFE2 + AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cuda", AT_WRAP([&] { + if (( + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) && range >= 1ULL << 32) + { + // define lambda to mod with range and add base + auto random_func = [range, base] __device__ (uint64_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + auto random_func = [range, base] __device__ (uint32_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> uint4 { + return curand4(state); + }, + random_func); + } + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +#else + AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cuda", AT_WRAP([&] { + if (range >= 1ULL << 28) // allow approx 5% skew in uniform int generation using % + { + // define lambda to mod with range and add base + auto random_func = [range, base] __device__ (uint64_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + auto random_func = [range, base] __device__ (uint32_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> uint4 { + return curand4(state); + }, + random_func); + } + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +#endif +} + +// This is the special kernel to handle single specific case: +// from(inclusive) = std::numeric_limits::lowest() +// to(exclusive) = None (= std::numeric_limits::max() + 1) +template +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cuda", [&] { + if (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + auto random_func = [] __device__ (uint64_t rand) { + return transformation::uniform_int_full_range(rand); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + TORCH_CHECK(false, "random_full_64_bits_range_kernel_cuda handles only int64, double, float and bfloat16"); + } + }); +} + +template +struct RandomFromToKernel { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { + random_from_to_kernel(iter, range, base, check_generator(gen)); + } + void operator()(TensorIteratorBase& iter, std::optional gen) { + random_full_64_bits_range_kernel(iter, check_generator(gen)); + } +}; + +template +void random_kernel(TensorIteratorBase& iter, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "random_kernel_cuda", [&] { + if (std::is_same_v || std::is_same_v) { + auto random_func = [] __device__ (uint64_t rand) { + return transformation::uniform_int(rand); + }; + distribution_nullary_kernel(iter, gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + auto random_func = [] __device__ (uint32_t rand) { + return transformation::uniform_int(rand); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> uint4 { + return curand4(state); + }, + random_func); + } + }); +} + +template +struct RandomKernel { + void operator()(TensorIteratorBase& iter, RNG gen) { + random_kernel(iter, gen); + } +}; + +// ==================================================================================================================== + +template +void uniform_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) { + if (std::is_same_v) { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> double2 { return curand_uniform2_double(state); }, + transform); + } else { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> float4 { return curand_uniform4(state); }, + transform); + } +} + +template +void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) { + if (std::is_same_v) { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> double2 { return curand_normal2_double(state); }, + transform); + } else { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> float4 { return curand_normal4(state); }, + transform); + } +} + +// ==================================================== Normal ======================================================== + +template +void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) { + auto iter = TensorIterator::borrowing_nullary_op(self); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_cuda", [&] { + using accscalar_t = at::acc_type; + auto mean = static_cast(mean_); + auto std = static_cast(std_); + // define lambda to multiply std and add mean + auto normal_func = [mean, std] __device__ (accscalar_t rand) { + return static_cast(transformation::normal(rand, mean, std)); + }; + normal_and_transform(iter, gen, normal_func); + }); +} + +template +struct NormalKernel { + void operator()(const TensorBase &self, double mean, double std, std::optional gen) { + normal_kernel(self, mean, std, check_generator(gen)); + } +}; + +// ==================================================== Uniform ======================================================== + +template +void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "uniform_kernel_cuda", [&] { + auto from = static_cast(from_); + auto to = static_cast(to_); + using opmath_t = at::opmath_type; + auto range = static_cast(to-from); + // define lambda to reverse bounds, multiply 'range' and add 'from_' + auto uniform_func = [range, from, to] __device__ (opmath_t rand) { + // Compute output value before reversing the bounds + // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/96947 + auto value = static_cast(rand * range + from); + // reverse the bounds of curand4 from (0, 1] to [0, 1) + // Note that this method is from legacy THCTensorRandom and is likely to give + // you more 0-s, since, the probability of getting 1-s is higher than 0-s and + // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s. + // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706 + auto reverse_bound_value = value == to ? from : value; + return reverse_bound_value; + }; + uniform_and_transform(iter, gen, uniform_func); + }); +} + +template +struct UniformKernel { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { + uniform_kernel(iter, from, to, check_generator(gen)); + } +}; + +// ================================================== LogNormal ======================================================= + +template +void log_normal_kernel(TensorIteratorBase& iter, double mean_, double std_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cuda", [&] { + using accscalar_t = at::acc_type; + auto mean = static_cast(mean_); + auto std = static_cast(std_); + // define lambda for log_normal transformation + auto log_normal_func = [mean, std] __device__ (accscalar_t rand) { + return static_cast(transformation::log_normal(transformation::normal(rand, mean, std))); + }; + normal_and_transform(iter, gen, log_normal_func); + }); +} + +template +struct LogNormalKernel { + void operator()(TensorIteratorBase& iter, double mean, double std, std::optional gen) { + log_normal_kernel(iter, mean, std, check_generator(gen)); + } +}; + +// =================================================== Geometric ====================================================== + +template +void geometric_kernel(TensorIteratorBase& iter, double p, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cuda", [&] { + using accscalar_t = at::DiscreteDistributionType::type; + // define lambda for geometric transformation + auto geometric_func = [p] __device__ (accscalar_t rand) { + return static_cast(transformation::geometric(rand, p)); + }; + uniform_and_transform(iter, gen, geometric_func); + }); +} + +template +struct GeometricKernel { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { + geometric_kernel(iter, p, check_generator(gen)); + } +}; + +// ================================================== Exponential ===================================================== + +template +void exponential_kernel(TensorIteratorBase& iter, double lambda_, RNG gen) { + TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype()); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cuda", [&] { + using accscalar_t = at::acc_type; + auto lambda = static_cast(lambda_); + // define lambda for exponential transformation + auto exponential_func = [lambda] __device__ (accscalar_t rand) { + return static_cast(transformation::exponential(rand, lambda)); + }; + uniform_and_transform(iter, gen, exponential_func); + }); +} + +template +struct ExponentialKernel { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { + exponential_kernel(iter, lambda, check_generator(gen)); + } +}; + +// ==================================================== Cauchy ======================================================== + +template +void cauchy_kernel(TensorIteratorBase& iter, double median_, double sigma_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "cauchy_cuda", [&] { + using accscalar_t = at::acc_type; + auto median = static_cast(median_); + auto sigma = static_cast(sigma_); + // define lambda for cauchy transformation + auto cauchy_func = [median, sigma] __device__ (accscalar_t rand) { + return static_cast(transformation::cauchy(rand, median, sigma)); + }; + uniform_and_transform(iter, gen, cauchy_func); + }); +} + +template +struct CauchyKernel { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { + cauchy_kernel(iter, median, sigma, check_generator(gen)); + } +}; + +// ==================================================== Bernoulli ===================================================== + +template +void bernoulli_tensor_cuda_kernel( + const TensorBase &ret, const at::TensorBase &p, + PhiloxCudaState philox_args) { + auto functor = [philox_args] __device__( + int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4, + const prob_t& p1, const prob_t& p2, const prob_t& p3, const prob_t& p4) { + auto seeds = at::cuda::philox::unpack(philox_args); + curandStatePhilox4_32_10_t state; + curand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); + + // See Note [Register spilling in curand call for CUDA < 10] + float4 rand = curand_uniform4(&state); + switch (n) { + case 4: { + CUDA_KERNEL_ASSERT(0 <= p4 && p4 <= 1); + v4 = static_cast(rand.w <= p4); + [[fallthrough]]; + } + case 3: { + CUDA_KERNEL_ASSERT(0 <= p3 && p3 <= 1); + v3 = static_cast(rand.z <= p3); + [[fallthrough]]; + } + case 2: { + CUDA_KERNEL_ASSERT(0 <= p2 && p2 <= 1); + v2 = static_cast(rand.y <= p2); + [[fallthrough]]; + } + case 1: { + CUDA_KERNEL_ASSERT(0 <= p1 && p1 <= 1); + v1 = static_cast(rand.x <= p1); + } + } + }; + // The template argument `4` below indicates that we want to operate on four + // element at each time. See NOTE [ CUDA_tensor_applyN helpers ] for details. + at::cuda::CUDA_tensor_apply2(ret, p, functor); +} + +template +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) { + PhiloxCudaState rng_engine_inputs; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_cuda_state(10); + } + TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type()); + // cast probabilities tensor to double for double `self` tensor, and to `float` for everything else + const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat; + auto p_cuda = p_.to(TensorOptions().device(self.device()).dtype(p_type)); + auto p = expand_inplace(self, p_cuda); + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] { + if (std::is_same_v) { + return bernoulli_tensor_cuda_kernel(self, *p, rng_engine_inputs); + } else { + return bernoulli_tensor_cuda_kernel(self, *p, rng_engine_inputs); + } + }); +} + +template +void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "bernoulli_scalar_cuda_", [&] { + using accscalar_t = at::DiscreteDistributionType::type; + // define lambda for bernoulli transformation + auto bernoulli_func = [p] __device__ (accscalar_t rand) { + return static_cast(transformation::bernoulli(rand, p)); + }; + uniform_and_transform(iter, gen, bernoulli_func); + }); +} + +template +struct BernoulliKernel { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { + bernoulli_kernel(iter, p, check_generator(gen)); + } + void operator()(const TensorBase &self, const TensorBase &p_, std::optional gen) { + bernoulli_kernel(self, p_, check_generator(gen)); + } +}; + +}}}} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Distributions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Distributions.h new file mode 100644 index 0000000000000000000000000000000000000000..11deb513c87a8beb6e44a7ff550b4c39918ecc51 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Distributions.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at { +struct CUDAGeneratorImpl; +struct TensorIteratorBase; +class TensorBase; + +namespace native { + +void launch_poisson_cuda_kernel( + const TensorBase &ret, const TensorBase &lambda, CUDAGeneratorImpl *gen); + +void launch_gamma_kernel( + const TensorBase &ret, const TensorBase &alpha, CUDAGeneratorImpl *gen); + +void launch_binomial_cuda_kernel( + TensorIteratorBase &iter, CUDAGeneratorImpl *gen); + +void launch_dirichlet_kernel(TensorIteratorBase &iter); + +void launch_standard_gamma_grad_kernel(TensorIteratorBase &iter); + +void launch_dirichlet_grad_kernel(TensorIteratorBase &iter); + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..d155eb420ebbfdff544f613ca3d7f3dc5bd58d88 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +namespace at::native { + +Tensor embedding_backward_cuda_kernel( + const Tensor &grad, + const Tensor &orig_indices, + const Tensor &sorted_indices, + const Tensor &count, + int64_t num_weights, + int padding_idx = -1, + bool mode_mean = false, + const Tensor &offset2bag = Tensor(), + const Tensor &bag_size = Tensor(), + const Tensor &per_sample_weights = Tensor()); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh new file mode 100644 index 0000000000000000000000000000000000000000..461ea60b8fda7b89edaa0d04c39f4a9c046bcb87 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh @@ -0,0 +1,743 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +namespace at::native { + +namespace { + +// TODO(crcrpar): Handle version bump in codegen. +// rel: +// https://github.com/pytorch/pytorch/blob/9cf84347767c8abb8feba18a9a1baba321eeb8b9/tools/autograd/gen_inplace_or_view_type.py#L481-L482 +inline void increment_version(TensorList tensors) { + for (const auto& t : tensors) { + t.unsafeGetTensorImpl()->bump_version(); + } +} + +// Initializes args and checks if all args are aligned +template +__device__ bool init_args( + T** args, + TensorListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +// Initializes args and checks if all args are aligned +template +__device__ bool init_args( + T** args, + TensorListScalarListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +template +__device__ bool init_args( + T** args, + FusedOptimizerTensorListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +template +__device__ void load_args( + T r_args[][kILP], + T** args, + const int64_t i_start, + const int64_t chunk_size, + const int64_t n) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + const auto i = i_start + threadIdx.x + ii * blockDim.x; + for (int r_index = 0; r_index < depth; r_index++) { + r_args[r_index][ii] = 0; + if (i < n && i < chunk_size) { + r_args[r_index][ii] = args[r_index][i]; + } + } + } +} + +template +__device__ void store_args( + T* dst, + T* src, + const int64_t i_start, + const int64_t chunk_size, + const int64_t n) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + const int64_t i = i_start + threadIdx.x + ii * blockDim.x; + if (i < n && i < chunk_size) + dst[i] = src[ii]; + } +} + +template +__device__ __forceinline__ void binary_op_scalar( + T r_args[][kILP], + T** args, + opmath_t scalar, + const int64_t n, + const int64_t chunk_size, + const bool all_aligned, + Op op) { + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(scalar))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 1 (for inplace) or 2 (for out of place), r_args + // has depth 1 + load_args<1>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(scalar))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } +} + +template +__device__ __forceinline__ void pointwise_op_scalar( + T r_args[][kILP], + T** args, + opmath_t scalar, + const int64_t n, + const int64_t chunk_size, + const bool all_aligned, + Op op) { + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); + load_store(r_args[2], args[2], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + static_cast(r_args[0][ii]) + + scalar * + op(static_cast(r_args[1][ii]), + static_cast(r_args[2][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 3 (for inplace) or 4 (for out of place), r_args + // has depth 3 + load_args<3>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + static_cast(r_args[0][ii]) + + scalar * + op(static_cast(r_args[1][ii]), + static_cast(r_args[2][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } +} + +// +// Binary Functors +// +template +struct BinaryOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t scalar) { + const int tensor_loc = tl.block_to_tensor[blockIdx.x]; + const int chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + binary_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct BinaryOpScalarListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + opmath_t scalar = tl.scalar_vals[tensor_loc]; + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + binary_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct BinaryOpListAlphaFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t alpha) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + alpha * static_cast(r_args[1][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + alpha * static_cast(r_args[1][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct BinaryOpScalarTensorFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op, + T* scalar, + opmath_t alpha) { + const int tensor_loc = tl.block_to_tensor[blockIdx.x]; + const int chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast(op( + static_cast(r_args[0][ii]), + static_cast(alpha) * static_cast(*scalar))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 1 (for inplace) or 2 (for out of place), + // r_args has depth 1 + load_args<1>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast(op( + static_cast(r_args[0][ii]), + static_cast(alpha) * static_cast(*scalar))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +// +// Unary Functors +// + +template +struct ZeroFunctor { + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata<1>& tl) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const auto all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = 0; + } + // store + load_store(args[0], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = 0; + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct UnaryOpFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + static_cast(op(static_cast(r_args[0][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + static_cast(op(static_cast(r_args[0][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +// +// Pointwise Functors +// + +template +struct PointwiseOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t scalar) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + pointwise_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct PointwiseOpScalarListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + opmath_t scalar = tl.scalar_vals[tensor_loc]; + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + pointwise_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct PointwiseOpListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[depth - 1][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]))); + } + // store + load_store(args[2], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]))); + } + store_args(args[2], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct TernaryOpListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op) { + static_assert(depth == 3 || depth == 4, ""); + static_assert(depth >= r_args_depth, ""); + static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); + load_store(r_args[2], args[2], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + static_cast(r_args[2][ii])); + } + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + static_cast(r_args[2][ii])); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct TernaryOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t alpha) { + static_assert(depth == 2 || depth == 3, ""); + static_assert(depth >= r_args_depth, ""); + static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + alpha); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + alpha); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct TernaryOpScalarListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + static_assert(depth == 2 || depth == 3, ""); + static_assert(depth >= r_args_depth, ""); + static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + const opmath_t scalar = tl.scalar_vals[tensor_loc]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + scalar); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + scalar); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct power_functor { + C10_DEVICE T operator()(const T& a, const T& b) const { + return at::native::pow_(a, b); + } +}; + +template +struct reverse_power_functor { + C10_DEVICE T operator()(const T& a, const T& b) const { + return at::native::pow_(b, a); + } +}; + +} // namespace +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh new file mode 100644 index 0000000000000000000000000000000000000000..302ed64991ab18b46d7e7bfe35cbd8482fc1c941 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +// std:: does not have clamp functors +template +struct minimum { + __device__ T operator()(const T& a, const T& b) const { + return (_isnan(a) || a < b) ? a : b; + } +}; + +template +struct maximum { + __device__ T operator()(const T& a, const T& b) const { + return (_isnan(a) || a > b) ? a : b; + } +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh new file mode 100644 index 0000000000000000000000000000000000000000..b7fca1a7e66a393118ce90102a8fae071e2f7fb3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh @@ -0,0 +1,326 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +using detail::GridSamplerInterpolation; +using detail::GridSamplerPadding; + +// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value, +// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5). +// if align_corners: -1 and +1 get sent to the centers of the corner pixels +// -1 --> 0 +// +1 --> (size - 1) +// scale_factor = (size - 1) / 2 +// if not align_corners: -1 and +1 get sent to the image edges +// -1 --> -0.5 +// +1 --> (size - 1) + 0.5 == size - 0.5 +// scale_factor = size / 2 +template +__forceinline__ __device__ +scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1.f) / 2) * (size - 1); + } else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1.f) * size - 1) / 2; + } +} + +// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize +// except that it also returns the `d output / d input` via pointer argument +// `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +__forceinline__ __device__ +scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size, + bool align_corners, scalar_t *grad_in) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +// Clips coordinates to between 0 and clip_limit - 1 +template +__forceinline__ __device__ +scalar_t clip_coordinates(scalar_t in, int clip_limit) { + return ::min(static_cast(clip_limit - 1), ::max(in, static_cast(0))); +} + +// clip_coordinates_set_grad works similarly to clip_coordinates except that +// it also returns the `d output / d input` via pointer argument `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +__forceinline__ __device__ +scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_in) { + // Note that it is important for the gradient calculation that borders + // are considered out of bounds. + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + scalar_t max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +// Reflects coordinates until they fall between low and high (inclusive). +// The bounds are passed as twice their value so that half-integer values +// can be represented as ints. +template +__forceinline__ __device__ +scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = ::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. + scalar_t extra = ::fmod(in, span); + int flips = static_cast(::floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +// reflect_coordinates_set_grad works similarly to reflect_coordinates except +// that it also returns the `d output / d input` via pointer argument +// `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +__forceinline__ __device__ +scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high, + scalar_t *grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + int grad_in_mult_; + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + // `fmod` returns same sign as `in`, which is positive after the `if` above. + scalar_t extra = ::fmod(in, span); + int flips = static_cast(::floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +__forceinline__ __device__ +scalar_t safe_downgrade_to_int_range(scalar_t x){ + // -100.0 does not have special meaning. This is just to make sure + // it's not within_bounds_2d or within_bounds_3d, and does not cause + // undefined behavior. See #35506. + if (x > INT_MAX-1 || x < INT_MIN || !::isfinite(static_cast(x))) + return static_cast(-100.0); + return x; +} + +template +__forceinline__ __device__ +scalar_t compute_coordinates(scalar_t coord, int size, + GridSamplerPadding padding_mode, + bool align_corners) { + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } else if (padding_mode == GridSamplerPadding::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates(coord, 0, 2*(size - 1)); + } else { + coord = reflect_coordinates(coord, -1, 2*size - 1); + } + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } + + coord = safe_downgrade_to_int_range(coord); + return coord; +} + +// Computes the pixel source index value for a grid coordinate +template +__forceinline__ __device__ +scalar_t grid_sampler_compute_source_index( + scalar_t coord, + int size, + GridSamplerPadding padding_mode, + bool align_corners) { + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; +} + +// grid_sampler_compute_source_index_set_grad works similarly to +// grid_sampler_compute_source_index except that it also returns the +// `d output / d input` via pointer argument `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +__forceinline__ __device__ +scalar_t grid_sampler_compute_source_index_set_grad( + scalar_t coord, + int size, + GridSamplerPadding padding_mode, + bool align_corners, + scalar_t *grad_in) { + scalar_t grad_clip, grad_refl; + coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in); + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + coord = clip_coordinates_set_grad(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == GridSamplerPadding::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl); + } else { + coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl); + } + // clip coordinates to image borders + coord = clip_coordinates_set_grad(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + + coord = safe_downgrade_to_int_range(coord); + return coord; +} + +__forceinline__ __device__ +bool within_bounds_2d(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +__forceinline__ __device__ +bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; +} + +template +__forceinline__ __device__ +scalar_t get_value_bounded( + const scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH, + GridSamplerPadding padding_mode, + bool align_corners) { + + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int ix = static_cast(x); + int iy = static_cast(y); + + if (within_bounds_2d(iy, ix, H, W)) { + return data[iy * sH + ix * sW]; + } + return static_cast(0); +} + +template +__forceinline__ __device__ +void safe_add_2d(scalar_t *data, int h, int w, + int sH, int sW, int H, int W, + scalar_t delta, + const index_t NC_offset, + const index_t memory_span) { + if (within_bounds_2d(h, w, H, W)) { + fastAtomicAdd(data, + NC_offset + h * sH + w * sW, + memory_span, + delta, + true); + } +} + +template +__forceinline__ __device__ +void safe_add_3d(scalar_t *data, int d, int h, int w, + int sD, int sH, int sW, int D, int H, int W, + scalar_t delta, + const index_t NC_offset, + const index_t memory_span) { + if (within_bounds_3d(d, h, w, D, H, W)) { + fastAtomicAdd(data, + NC_offset + d * sD + h * sH + w * sW, + memory_span, + delta, + true); + } +} + +template +__forceinline__ __device__ +void add_value_bounded( + scalar_t* data, scalar_t x, scalar_t y, int W, int H, int sW, int sH, + scalar_t delta, + GridSamplerPadding padding_mode, + bool align_corners, + const index_t NC_offset, + const index_t memory_span) { + + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int ix = static_cast(x); + int iy = static_cast(y); + + safe_add_2d(data, iy, ix, sH, sW, H, W, delta, NC_offset, memory_span); +} + +// Calculate the differential of the cubic convolution, i.e. `d coeff / d x` +template +__forceinline__ __device__ +void get_cubic_coefficients_grad( + scalar_t coeffs[4], + scalar_t t) { + + // Must be the same as forward calculation in + // aten/src/ATen/native/cuda/UpSample.cuh:get_cubic_upsample_coefficients + scalar_t A = -0.75; + + scalar_t x; + x = -1 - t; // 1 < x = |-1 - tx| < 2 + coeffs[0] = (-3 * A * x - 10 * A ) * x - 8 * A; + x = -t; // x = |0 - tx| <= 1 + coeffs[1] = (-3 * (A + 2) * x - 2 * (A + 3)) * x; + x = 1 - t; // x = |1 - tx| <= 1 + coeffs[2] = (3 * (A + 2) * x - 2 * (A + 3)) * x; + x = 2 - t; // 1 < x = |2 - tx| < 2 + coeffs[3] = (3 * A * x - 10 * A) * x + 8 * A; +} + + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GridSampler.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GridSampler.h new file mode 100644 index 0000000000000000000000000000000000000000..3730e552317e77d4d9a110c6d330fa31fb2d4b47 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GridSampler.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +void launch_grid_sampler_2d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_3d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_2d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +void launch_grid_sampler_3d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMM.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMM.h new file mode 100644 index 0000000000000000000000000000000000000000..bb4431cb332fc6cddd3eb0005c6ec97ff9ec6b01 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMM.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::cuda::detail { +TORCH_API void bf16bf16_grouped_mm( + at::Tensor mat_a, // bf16 + at::Tensor mat_b, // bf16 + std::optional offs, + std::optional bias, // BF16 + at::Tensor& out); +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMMCommon.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMMCommon.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1c036c8edba98f808f615ff5a522942a10b05bd3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMMCommon.cuh @@ -0,0 +1,161 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::cuda::detail { + +using Strides = std::array; + +template < + typename DtypeA, + typename DtypeB, + typename DtypeOutput, + typename DtypeScale, + typename ProblemShape, + typename StrideA, + typename StrideB, + typename StrideOutput> +__global__ void prepare_grouped_gemm_data( + DtypeA* A, + DtypeB* B, + DtypeOutput* output, + DtypeScale* scale_A, + DtypeScale* scale_B, + DtypeA** A_ptrs, + DtypeB** B_ptrs, + DtypeOutput** output_ptrs, + DtypeScale** inputA_scale_ptrs, + DtypeScale** inputB_scale_ptrs, + ProblemShape* problem_sizes, + // Strides for cutlass, cute::Stride + StrideA* stride_A, + StrideB* stride_B, + StrideOutput* stride_output, + const int32_t* offs, + int32_t M, + int32_t N, + int32_t K, + // Original strides of the input tensors + Strides tensor_StrideA, + Strides tensor_StrideB, + Strides tensor_StrideOutput, + Strides tensor_ShapeA, + Strides tensor_ShapeB, + int64_t a_scale_stride, + int64_t b_scale_stride, + bool a_row_major = true, + bool b_row_major = false) { + int32_t tid = threadIdx.x; + int32_t delta = 0; + int32_t offset = 0; + if (offs != nullptr) { + int32_t start = tid == 0 ? 0 : offs[tid - 1]; + offset = offs[tid]; + delta = offset - start; + CUDA_KERNEL_ASSERT(delta >=0 && "expected gemm dimension to be greater or equal 0\n"); + + // TMA transfers require global memory tensor addresses to be + // aligned to 16 bytes. + if (tid < blockDim.x - 1) { + // Check this requirement for input tensors, in case group + // addresses are increased along the dynamic dimension. + if ((K < 0 && a_row_major) || // 2D/2D: check along K dimension + (M < 0 && !a_row_major)) { // 3D/2D: check along N dimension + int align = 128 / cutlass::sizeof_bits::value; + CUDA_KERNEL_ASSERT( + delta % align == 0 && + "expected input tensor dynamic dimension byte size to be non-negative multiple of 16\n"); + } + if ((K < 0 && !b_row_major) || // 2D/2D: check along K dimension + (N < 0 && b_row_major)) { // 3D/2D: check along N dimension + int align = 128 / cutlass::sizeof_bits::value; + CUDA_KERNEL_ASSERT( + delta % align == 0 && + "expected input tensor dynamic dimension byte size to be non-negative multiple of 16\n"); + } + + // Check the same requirement for output tensor (that is always + // contiguous, and in row-major layout). + if (N < 0) { + int align = 128 / cutlass::sizeof_bits::value; + CUDA_KERNEL_ASSERT( + delta % align == 0 && + "expected output tensor dynamic dimension byte size to be non-negative multiple of 16\n"); + } + } + } + int64_t lda, ldb, ldoutput; + if (M < 0) { + // A and output is 2d + CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[0] && "expected offset to be less than tensor size\n"); + M = delta; + lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1]; + ldb = b_row_major ? tensor_StrideB[1] : tensor_StrideB[2]; + ldoutput = tensor_StrideOutput[0]; + A_ptrs[tid] = tid == 0 ? A : A + offs[tid - 1] * tensor_StrideA[0]; + if (scale_A != nullptr) { + inputA_scale_ptrs[tid] = tid == 0 ? scale_A : scale_A + offs[tid - 1]; + inputB_scale_ptrs[tid] = scale_B + tid * b_scale_stride; + } + output_ptrs[tid] = tid == 0 ? output : output + offs[tid - 1] * ldoutput; + B_ptrs[tid] = B + tid * tensor_StrideB[0]; + } else if (N < 0) { + CUDA_KERNEL_ASSERT(offset <= tensor_ShapeB[1] && "expected offset to be less than tensor size\n"); + N = delta; + lda = a_row_major ? tensor_StrideA[1] : tensor_StrideA[2]; + ldb = b_row_major ? tensor_StrideB[0] : tensor_StrideB[1]; // B is transposed + ldoutput = tensor_StrideOutput[0]; + A_ptrs[tid] = A + tid * tensor_StrideA[0]; + output_ptrs[tid] = tid == 0 ? output : output + offs[tid - 1]; + B_ptrs[tid] = tid == 0 ? B : B + offs[tid - 1] * tensor_StrideB[1]; + if (scale_A != nullptr) { + inputA_scale_ptrs[tid] = scale_A + tid * a_scale_stride; + inputB_scale_ptrs[tid] = tid == 0 ? scale_B : scale_B + offs[tid - 1]; + } + } else if (K < 0) { + CUDA_KERNEL_ASSERT(offset <= tensor_ShapeA[1] && offset <= tensor_ShapeB[0] && "expected offset to be less than tensor size\n"); + // A, B is 2d, output is 3d + K = delta; + lda = a_row_major ? tensor_StrideA[0] : tensor_StrideA[1]; + ldb = b_row_major ? tensor_StrideB[0] : tensor_StrideB[1]; + ldoutput = tensor_StrideOutput[1]; + A_ptrs[tid] = tid == 0 ? A : A + offs[tid - 1] * tensor_StrideA[1]; + B_ptrs[tid] = tid == 0 ? B : B + offs[tid - 1] * tensor_StrideB[0]; + output_ptrs[tid] = output + tid * tensor_StrideOutput[0]; + if (scale_A != nullptr) { + inputA_scale_ptrs[tid] = scale_A + tid * M; + inputB_scale_ptrs[tid] = scale_B + tid * N; + } + } else { + // A, B, output are 3D + lda = a_row_major ? tensor_StrideA[1] : tensor_StrideA[2]; + ldb = b_row_major ? tensor_StrideB[1] : tensor_StrideB[2]; + ldoutput = tensor_StrideOutput[1]; + A_ptrs[tid] = A + tid * tensor_StrideA[0]; + B_ptrs[tid] = B + tid * tensor_StrideB[0]; + output_ptrs[tid] = output + tid * tensor_StrideOutput[0]; + if (scale_A != nullptr) { + inputA_scale_ptrs[tid] = scale_A + tid * a_scale_stride; + inputB_scale_ptrs[tid] = scale_B + tid * b_scale_stride; + } + } + problem_sizes[tid] = ProblemShape(M, N, K); + + // make_cute_packed_stride only replaces one of the stride elements with + // one the provided values in the shape arguments + // the indices of the src/dst depend on whether A/B are row-major + // so constructing shape argument with two similar lda values + // while it looks non-sensical (and it is a nonsensical shape) + // is fine for these stride construction purposes - the one that will be used + // for replacement is correct, the other one is ignored, and we don't have to + // branch on whether A/B are row-major + stride_A[tid] = cutlass::make_cute_packed_stride(StrideA{}, {lda, lda, 1}); + stride_B[tid] = cutlass::make_cute_packed_stride(StrideB{}, {ldb, ldb, 1}); + stride_output[tid] = + cutlass::make_cute_packed_stride(StrideOutput{}, {M, ldoutput, 1}); +} +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..47f808b8f8810d48dce14e3386639a118d99ad10 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernel.h @@ -0,0 +1,20 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +struct TensorIteratorBase; +class TensorBase; +} + +namespace at::native { +/// @param maskPrefixSum[in,out] +void launch_masked_scatter_kernel( + const TensorBase &self, const TensorBase &mask, + const TensorBase &maskPrefixSum, const TensorBase &source); +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernelUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernelUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..d5053ac466b193ddca34470d7f859624b3668669 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernelUtils.h @@ -0,0 +1,40 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) + +#include +#include +#include + +namespace at::native { + +template +inline bool fast_gather_kernel_eligible(const TensorIterator& iter, char * const out_ptr, char * const in_ptr, const size_t index_stride_bytes, const size_t element_size) { + using at::native::memory::get_alignment; + const auto index_element_size = iter.element_size(2); + //TensorIterator strides and sizes are ordered fastest moving to slowest moving, + //in contrast to regular sizes + // we need contiguous source and dst slices and aligned pointers and strides and slice size to do vectorized loads + // also we need idx to be expanded in the last dimension so we can copy entire slices + // and we need the src tensor to keep 0 stride from restriding + // (it could have been deleted by dimension collapse, in this case iterator would still be 2d + // but we cannot use fast path) + + return iter.ndim() == 2 && iter.strides(2)[0]==0 && iter.strides(2)[1]==index_element_size && + static_cast(iter.strides(0)[0])==element_size && + static_cast(iter.strides(1)[0])==element_size && static_cast(iter.strides(1)[1] == 0) && + get_alignment(out_ptr) == alignment && get_alignment(in_ptr) == alignment && + get_alignment(static_cast(iter.shape()[0] * element_size)) == alignment && + get_alignment(static_cast(index_stride_bytes)) == alignment && + get_alignment(static_cast(iter.strides(0)[1])) == alignment; +} + +template +void vectorized_gather_kernel_launch(char * out, char * inp, index_t * idx, int num_ind, + int64_t slice_size_in_bytes, int64_t ind_dim_size, int64_t inp_stride_bytes, int64_t out_stride_bytes, + bool allow_neg_indices=false); + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh new file mode 100644 index 0000000000000000000000000000000000000000..efd0967b118f633bc15aba5f836bb785a34fceef --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh @@ -0,0 +1,191 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#if AT_USE_JITERATOR() + +#include + +#include +#include +#include + +#include + +#include + +namespace at::native { + +/* Note [Jiterator] +The "jiterator" simply just-in-time compiles the same kernels that +Loops.cuh (and CUDALoops.cuh) usually build. This reduces build time, +build size, and initial CUDA context size. + +By default on non-Windows systems, it also caches compiled kernels in ~/.cache/torch/kernels. +This behavior is controlled with two environment variables: + - USE_PYTORCH_KERNEL_CACHE, if set to zero then this will disable all cache use + - PYTORCH_KERNEL_CACHE_PATH, if set specifies the folder to use for cached kernels + +The jiterator currently has some limitations, however. It cannot: + - handle math on complex datatypes + - handle kernels with scalar parameters + +These improvements will likely come soon. + +For examples of how to use the jiterator see the i1 and gcd kernel +implementations, which pass jittable strings implementing their +operations instead of the typical CUDA functors. + +To pass a runtime argument (similar to lambda captures in non-JIT kernels), +we need to pass to additional arguments to `jitted_gpu_kernel` by value. +Currently only primitive C++ types used for computation are valid. +The order of these extra arguments should be same as the order they appear +in kernel's function signature. (look at polygamma for example) + +NOTE: One big restriction being that these arguments should be after the +arguments provided by TensorIterator. Eg. While capturing `n`, where +`scalar_t x` and `scalar_t y` are provided by TensorIterator, +* foo(scalar_t x, scalar_t y, int n) works! +* foo(int n, scalar_t x, scalar_y) doesn't work +* foo(scalar_t x, int n, scalar_y) doesn't work + +*/ + +// Entrypoint for jitted GPU kernels. +// Only handles elementwise unary and binary kernels with a +// common dtype and a single output. +// NOTE: this assumes the op's iterator has a common_dtype. +// NOTE: We use std::tuple instead of parameter pack +// for `extra_args` due to following +// bug on older versions of clang +// https://bugs.llvm.org/show_bug.cgi?id=23029 +template < + char const* name, + typename return_type, + typename f_inputs_type, + int arity, + typename... Args> +void jitted_gpu_kernel( + TensorIteratorBase& iter, + const std::string& f, + at::cuda::jit::BinaryFuncVariant scalar_pos = + at::cuda::jit::BinaryFuncVariant::NoScalar, + at::opmath_type scalar_val = 0, + std::tuple extra_args = std::make_tuple()) { + // TODO: much of preamble is common to both jitted_gpu_kernel and gpu_kernel + // Maybe it could be refactored? + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_cuda(), + "argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + jitted_gpu_kernel( + sub_iter, f, scalar_pos, scalar_val, extra_args); + } + + return; + } + + // Computes if dynamic casting is needed + // Dynamic casting is needed if an input's dtype differs from the common dtype + // or if the result dtype differs from the output's dtype + // Note: this is intentionally divergent from calling needs_dynamic_casting, + // which is more general and inspects a lambda to determine if dynamic + // casting is needed. + bool needs_dynamic_casting = false; + + // Checks output + const ScalarType return_scalar_type = c10::CppTypeToScalarType::value; + const auto dtype0 = iter.dtype(0); + if (dtype0 != return_scalar_type) { + needs_dynamic_casting = true; + } + + // Checks input(s) + const ScalarType inputs_scalar_type = c10::CppTypeToScalarType::value; + for (auto i = decltype(arity){1}; i < (arity + 1); ++i) { + const auto dtypei = iter.dtype(i); + if (dtypei != inputs_scalar_type) { + needs_dynamic_casting = true; + break; + } + } + if (scalar_pos == at::cuda::jit::BinaryFuncVariant::NoScalar) { + // NOTE: With `scalar_pos=NoScalar`,`scalar_val` is not used + // for computation in the generated code and hence we pass a dummy + // value of `0`. + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::NoScalar>( + iter, f, needs_dynamic_casting, /*scalar_val=*/scalar_val, extra_args); + } else if (scalar_pos == at::cuda::jit::BinaryFuncVariant::RhsScalar) { + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::RhsScalar>( + iter, + f, + needs_dynamic_casting, + scalar_val, + extra_args); + + } else { + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::LhsScalar>( + iter, + f, + needs_dynamic_casting, + scalar_val, + extra_args); + } +} + +// TODO: support runtime state capture similar to `jitted_gpu_kernel`. +template +void opmath_jitted_gpu_kernel_with_scalars(TensorIteratorBase& iter, const std::string& f) { + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + //currently jiterator only handles binary functions where both inputs are of the same type (f_inputs_type) + using opmath_t = at::opmath_type; + if (iter.is_cpu_scalar(1)) { + auto scalar_val = iter.scalar_value(1); + iter.remove_operand(1); + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + const OptionalDeviceGuard device_guard(iter.device(1)); + jitted_gpu_kernel(iter, f, at::cuda::jit::BinaryFuncVariant::LhsScalar, scalar_val); + } else if (iter.is_cpu_scalar(2)) { + auto scalar_val = iter.scalar_value(2); + iter.remove_operand(2); + jitted_gpu_kernel(iter, f, at::cuda::jit::BinaryFuncVariant::RhsScalar, scalar_val); + } else { + jitted_gpu_kernel(iter, f); + } +} + +} // namespace at::native + +#endif // AT_USE_JITERATOR() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..73a5ba0a479bb98cbd79e7b0312a6a778287a056 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh @@ -0,0 +1,411 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#if !(defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))) +#include +#endif + +// ROCm 6.3 is planned to have these functions, but until then here they are. +#if defined(USE_ROCM) +#include +#include +#include + +__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) { +#if (defined(__gfx942__)) && \ + __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16) + typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2; + static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw)); + union { + __hip_bfloat162_raw bf162_raw; + vec_short2 vs2; + } u{static_cast<__hip_bfloat162_raw>(value)}; + u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2); + return static_cast<__hip_bfloat162>(u.bf162_raw); +#else + static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw)); + union u_hold { + __hip_bfloat162_raw h2r; + unsigned int u32; + }; + u_hold old_val, new_val; + old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + do { + new_val.h2r = __hadd2(old_val.h2r, value); + } while (!__hip_atomic_compare_exchange_strong( + (unsigned int*)address, &old_val.u32, new_val.u32, + __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)); + return old_val.h2r; +#endif +} + +__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) { +#if (defined(__gfx942__)) && \ + __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16) + // The api expects an ext_vector_type of half + typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162; + static_assert(sizeof(vec_fp162) == sizeof(__half2_raw)); + union { + __half2_raw h2r; + vec_fp162 fp16; + } u {static_cast<__half2_raw>(value)}; + u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16); + return static_cast<__half2>(u.h2r); +#else + static_assert(sizeof(__half2_raw) == sizeof(unsigned int)); + union u_hold { + __half2_raw h2r; + unsigned int u32; + }; + u_hold old_val, new_val; + old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + do { + new_val.h2r = __hadd2(old_val.h2r, value); + } while (!__hip_atomic_compare_exchange_strong( + (unsigned int*)address, &old_val.u32, new_val.u32, + __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)); + return old_val.h2r; +#endif +} +#define ATOMICADD preview_unsafeAtomicAdd +#define NATIVE_ZERO_BF16 __float2bfloat16(0.0f) +#else +#define ATOMICADD atomicAdd +#define NATIVE_ZERO_BF16 __int2bfloat16_rz(0) +#endif + +namespace at:: native { + +__device__ __forceinline__ size_t +idx(const size_t nc, + const size_t height, + const size_t width, + const size_t h, + const size_t w) { + return (nc * height + h) * width + w; +} + +// for channels-last +__device__ __forceinline__ size_t +idx_cl( + const size_t n, const size_t h, const size_t w, const size_t c, + const size_t height, const size_t width, const size_t channel +) { + return ((n * height + h) * width + w) * channel + c; +} + +// fastSpecializedAtomicAdd (and fastAtomicAdd) are an optimization +// that speed up half-precision atomics. The situation with half +// precision atomics is that we have a slow __half atomic, and +// a fast vectored __half2 atomic (this can be worth up to a 6x +// speedup, see https://github.com/pytorch/pytorch/pull/21879). +// We can convert a __half atomic into a __half2 atomic by simply +// pairing the __half with a zero entry on the left/right depending +// on alignment... but only if this wouldn't cause an out of bounds +// access! Thus, you must specify tensor and numel so we can check +// if you would be out-of-bounds and use a plain __half atomic if +// you would be. +template < + typename scalar_t, + typename index_t, + typename std::enable_if_t>* = + nullptr> +__device__ __forceinline__ void fastSpecializedAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)) + gpuAtomicAddNoReturn( + reinterpret_cast(tensor) + index, + static_cast(value)); +#else + // Accounts for the chance tensor falls on an odd 16 bit alignment (ie, not 32 bit aligned) + __half* target_addr = reinterpret_cast<__half*>(tensor + index); + bool low_byte = (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + + if (low_byte && index < (numel - 1)) { + __half2 value2; + value2.x = static_cast<__half>(value); + value2.y = __int2half_rz(0); + ATOMICADD(reinterpret_cast<__half2*>(target_addr), value2); + + } else if (!low_byte && index > 0) { + __half2 value2; + value2.x = __int2half_rz(0); + value2.y = static_cast<__half>(value); + ATOMICADD(reinterpret_cast<__half2*>(target_addr - 1), value2); + + } else { +#ifdef USE_ROCM + gpuAtomicAddNoReturn( + reinterpret_cast(tensor) + index, static_cast(value)); +#else + atomicAdd( + reinterpret_cast<__half*>(tensor) + index, static_cast<__half>(value)); +#endif + } +#endif +} + +template < + typename scalar_t, + typename index_t, + typename std::enable_if_t>* = + nullptr> +__device__ __forceinline__ void fastSpecializedAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)) + gpuAtomicAddNoReturn( + reinterpret_cast(tensor) + index, + static_cast(value)); +#else + // Accounts for the chance tensor falls on an odd 16 bit alignment (ie, not 32 bit aligned) + __nv_bfloat16* target_addr = reinterpret_cast<__nv_bfloat16*>(tensor + index); + bool low_byte = (reinterpret_cast(target_addr) % sizeof(__nv_bfloat162) == 0); + + if (low_byte && index < (numel - 1)) { + __nv_bfloat162 value2; + value2.x = *reinterpret_cast<__nv_bfloat16*>(&value); + value2.y = NATIVE_ZERO_BF16; + ATOMICADD(reinterpret_cast<__nv_bfloat162*>(target_addr), value2); + + } else if (!low_byte && index > 0) { + __nv_bfloat162 value2; + value2.x = NATIVE_ZERO_BF16; + value2.y = *reinterpret_cast<__nv_bfloat16*>(&value); + ATOMICADD(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2); + + } else { +#ifdef USE_ROCM + gpuAtomicAddNoReturn( + reinterpret_cast(tensor) + index, static_cast(value)); +#else + atomicAdd( + reinterpret_cast<__nv_bfloat16*>(tensor) + index, *reinterpret_cast<__nv_bfloat16*>(&value)); +#endif + } +#endif +} + + +template < + typename scalar_t, + typename index_t, + typename std::enable_if_t && !std::is_same_v>* = + nullptr> +__device__ __forceinline__ void fastSpecializedAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value) { + gpuAtomicAddNoReturn(tensor + index, value); +} + +template +__device__ __forceinline__ void fastAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value, + bool fast_atomics) { + if (fast_atomics) { + fastSpecializedAtomicAdd(tensor, index, numel, value); + } else { + gpuAtomicAddNoReturn(tensor + index, value); + } +} + + +#ifdef USE_ROCM +// This function implements a committed store. +// Upon returning, the store is committed to global memory. +// This is useful in avoiding the need for fences. +template +__device__ inline void cmtdStore(void* address, T value) { + int constexpr num_long_per_val = sizeof(value)/sizeof(long); + int constexpr num_int_per_val = sizeof(value)/sizeof(int); + int constexpr num_short_per_val = sizeof(value)/sizeof(short); + int constexpr num_char_per_val = sizeof(value)/sizeof(char); + union pnr { T v; + long l[num_long_per_val]; + int i[num_int_per_val]; + short s[num_short_per_val]; + char c[num_char_per_val]; } + _pnr = {.v = value }; + if constexpr (num_long_per_val*sizeof(long) == sizeof(value)) + for (int i=0; i(address)+i, _pnr.l[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + else if constexpr (num_int_per_val*sizeof(int) == sizeof(value)) + for (int i=0; i(address)+i, _pnr.i[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + else if constexpr (num_short_per_val*sizeof(short) == sizeof(value)) + for (int i=0; i(address)+i, _pnr.s[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + else if constexpr (num_char_per_val*sizeof(char) == sizeof(value)) + for (int i=0; i(address)+i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + asm volatile("s_waitcnt vmcnt(0)" ::: "memory"); + __atomic_signal_fence(__ATOMIC_SEQ_CST); +} +#endif + +#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)) +// This function implements warp-level opportunistic fastatomics +// To reduce contention on an atomicAdd, this replaces per-thread atomicAdd with a per-warp atomicAdd. +// We identify all the threads within a warp that will perform an atomicAdd on the same destination +// address and perform the addition on the CU. Each warp elects a leader thread which does the +// atomicAdd to the destination address. +template +__device__ __forceinline__ void opportunistic_fastAtomicAdd( + scalar_t* self_ptr, + index_t index, + const index_t numel, + scalar_t value) { + + scalar_t* dst = self_ptr + index; + + //pack coalesced bf16 and fp16 + if constexpr (std::is_same::value || std::is_same::value) + { + typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2; + union ill { unsigned int i[2]; int64_t il; }; + ill iil_, ill_oneUpDst = {}; + iil_.il = (int64_t)dst; + ill_oneUpDst.i[0] = __builtin_amdgcn_mov_dpp(iil_.i[0], 0x130, 0xf, 0xf, 0); + ill_oneUpDst.i[1] = __builtin_amdgcn_mov_dpp(iil_.i[1], 0x130, 0xf, 0xf, 0); + union bfi {scalar_t bf; short s; } bfi_ = { .bf = value }; bfi bfi_oneUpVal; + + bfi_oneUpVal.s = __builtin_amdgcn_mov_dpp(bfi_.s, 0x130, 0xf, 0xf, 0); + auto oneUpVal = bfi_oneUpVal.bf; + + __half* target_addr = reinterpret_cast<__half*>(self_ptr + index); + bool low_byte = (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + bool canCombnUp = (bool)(__activemask()&(1<<(threadIdx.x+1))) && + (low_byte && index < (numel - 1)) && + (ill_oneUpDst.il - reinterpret_cast(dst) == sizeof(scalar_t)); + bool canCombnDn = (__builtin_amdgcn_mov_dpp(canCombnUp, 0x138, 0xf, 0xf, 0)); + + if (__lane_id()%2==0) + { + if (canCombnUp) { + typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162; + union bfvs { scalar_t bf[2]; vec_short2 vs2; vec_fp162 df16; }; + bfvs bfvs_ = {}; + bfvs_.bf[0] = value; + bfvs_.bf[1] = oneUpVal; + if constexpr (std::is_same::value) + __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)dst, bfvs_.vs2); + else + __builtin_amdgcn_flat_atomic_fadd_v2f16((__half2*)dst, bfvs_.df16); + return; + } + } + else + { + if (canCombnDn) + return; + } + } + + // not coalesced, so now let try to capture lane-matches... + + if (numel > 16 /*<-hueristic threshold*/ * 64 ) { + // well shucks, unlikely to capture same-dest atomics in a wave. + // fall back to direct fastAtomic... + fastAtomicAdd(self_ptr, index, numel, value, true); + return; + } + + // __activemask() -- finds the set of threads in the warp that are about to perform atomicAdd + // __match_any_sync() -- returns bit mask of the threads that have same dest addr + auto mask = __match_any_sync(__activemask(), (int64_t)dst); + + // select a leader thread + int leader = __ffsll(mask) - 1; + + scalar_t crnt_val = (scalar_t)0; + auto crnt_msk = mask >> (leader); + int crnt_idx = leader; + + // __shfl is limited in the dtypes it accepts + // That's why, we need these if/else to correctly do the addition on the CU + if constexpr(sizeof(scalar_t) <= sizeof(int)) { + union punner { int l; scalar_t s; }; + punner pnr = {}; + pnr.s = value; + while (crnt_msk != 0) { + if (crnt_msk & 1) { + punner add_val = {}; + add_val.l = __shfl(pnr.l ,crnt_idx); + crnt_val += add_val.s; + } + crnt_idx++; + crnt_msk = crnt_msk >> 1; + } + } + else if constexpr(sizeof(scalar_t) <= sizeof(long)) { + union punner { long l; scalar_t s; }; + punner pnr = {}; + pnr.s = value; + while (crnt_msk != 0) { + if (crnt_msk & 1) { + punner add_val = {}; + add_val.l = __shfl(pnr.l ,crnt_idx); + crnt_val += add_val.s; + } + crnt_idx++; + crnt_msk = crnt_msk >> 1; + } + } + else if constexpr(sizeof(scalar_t) <= sizeof(long long)) { + union punner { long long l; scalar_t s; }; + punner pnr = {}; + pnr.s = value; + while (crnt_msk != 0) { + if (crnt_msk & 1) { + punner add_val = {}; + add_val.l = __shfl(pnr.l ,crnt_idx); + crnt_val += add_val.s; + } + crnt_idx++; + crnt_msk = crnt_msk >> 1; + } + } + else { + union punner { long long l[2]; scalar_t s; }; + punner pnr = {}; + pnr.s = value; + while (crnt_msk != 0) { + if (crnt_msk & 1) { + punner add_val = {}; + add_val.l[0] = __shfl(pnr.l[0] ,crnt_idx); + add_val.l[1] = __shfl(pnr.l[1] ,crnt_idx); + crnt_val += add_val.s; + } + crnt_idx++; + crnt_msk = crnt_msk >> 1; + } + } + + + //Once the correct crnt_val is determined, only the leader thread does the update to the dest addr + if (__lane_id() == leader) { + fastAtomicAdd(self_ptr, index, numel, crnt_val, true); + } +} +#endif + +#undef ATOMICADD +#undef NATIVE_ZERO_BF16 + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..793922407e457e5971bd14d03cb29fb9d275655c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/LaunchUtils.h @@ -0,0 +1,21 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +// returns 2**floor(log2(n)) +static int lastPow2(unsigned int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Loops.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Loops.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a856720d0a9d478b9e41dc8b065d7e03843a6411 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Loops.cuh @@ -0,0 +1,338 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + + + +namespace at::native { + +template +static OffsetCalculator make_input_offset_calculator(const TensorIteratorBase& iter) { + // array size can not be 0, this happens when N == 0 + constexpr int array_size = std::max(N, 1); + TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs()); + std::array strides; + int64_t element_sizes[array_size]; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i + iter.noutputs()).data(); + element_sizes[i] = iter.element_size(i + iter.noutputs()); + } + return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data(), element_sizes); +} + +template +static OffsetCalculator make_output_offset_calculator(const TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(num_outputs == iter.noutputs()); + std::array strides; + int64_t element_sizes[num_outputs]; + for (int i = 0; i < num_outputs; i++) { + strides[i] = iter.strides(i).data(); + element_sizes[i] = iter.element_size(i); + } + return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data(), element_sizes); +} + +template +__device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { + using traits = function_traits; + using return_t = typename traits::result_type; + using args_t = typename traits::ArgsTuple; + constexpr int elems_per_thread = policy_t::tws; + + int idx = blockIdx.x; + if constexpr (reverted_idx) + idx = gridDim.x - blockIdx.x - 1; + + return_t results[elems_per_thread]; + args_t args[elems_per_thread]; + + // load + policy.load(args, idx); + + // compute + #pragma unroll + for (int i = 0; i < elems_per_thread; i++) { + if (policy.check_inbounds(i)) { +#if defined(__HIP__) + results[i] = c10::guts::apply(f, args[i]); +#else + results[i] = std::apply(f, args[i]); +#endif + } + } + + // store + policy.store(results, idx); +} + +} // namespace at::native + +#include + +namespace at:: native { + +template +void gpu_kernel_nocast(TensorIteratorBase& iter, const func_t& f) { + + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_cuda(), + "argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_kernel_nocast(sub_iter, f); + } + return; + } + + gpu_kernel_impl_nocast(iter, f); +} + +template +void gpu_kernel(TensorIteratorBase& iter, const func_t& f) { + + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_cuda(), + "argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_kernel(sub_iter, f); + } + return; + } + + gpu_kernel_impl(iter, f); +} + +template +struct AUnaryFunctor { + using traits = function_traits; + using opmath_arg1_t = typename traits::template arg<0>::type; + __device__ return_t operator()(arg2_t b) const { + return f(a, b); + } + // NB: scalar is stored in higher precision! + AUnaryFunctor(func_t f_, opmath_arg1_t a_): f(f_), a(a_) {} + private: + func_t f; + opmath_arg1_t a; +}; + +template +struct BUnaryFunctor { + using traits = function_traits; + using opmath_arg2_t = typename traits::template arg<1>::type; + __device__ return_t operator()(arg1_t a) const { + return f(a, b); + } + // NB: scalar is stored in higher precision! + BUnaryFunctor(func_t f_, opmath_arg2_t b_): f(f_), b(b_) {} + private: + func_t f; + opmath_arg2_t b; +}; + +// Though seemingly noop, this inserts casts from arg1_t to func_t's type +// (which may be higher precision), as well as casts to return_t +template +struct BinaryFunctor { + __device__ return_t operator()(arg1_t a, arg2_t b) const { + return f(a, b); + } + BinaryFunctor(func_t f_): f(f_) {} + private: + func_t f; +}; + +// Unlike gpu_kernel_with_scalars, this allows you to pass a func_t which +// accepts inputs at higher precision (typically opmath_t), but then +// ensure that we load from memory at the correct precision (scalar_t) +// to avoid expensive loads. For the whole sordid story see +// https://dev-discuss.pytorch.org/t/cuda-loops-case-study-code-generation-vs-templates/302 +template +void opmath_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + + using traits = function_traits; + using opmath_arg1_t = typename traits::template arg<0>::type; + using opmath_arg2_t = typename traits::template arg<1>::type; + static_assert( + traits::arity == 2, + "gpu_kernel_with_scalars only supports two input arguments"); + + if (iter.is_cpu_scalar(1)) { + AUnaryFunctor af(f, iter.scalar_value(1)); + iter.remove_operand(1); + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + const OptionalDeviceGuard device_guard(iter.device(1)); + gpu_kernel(iter, af); + } else if (iter.is_cpu_scalar(2)) { + BUnaryFunctor bf(f, iter.scalar_value(2)); + iter.remove_operand(2); + gpu_kernel(iter, bf); + } else { + gpu_kernel(iter, BinaryFunctor(f)); + } +} + +template +void opmath_symmetric_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { + // Use symmetric property of the functor to reduce number of kernels, + // requires f(a, b) == f(b, a) + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + + using traits = function_traits; + using opmath_arg_t = typename traits::template arg<0>::type; + static_assert( + traits::arity == 2, + "gpu_kernel_with_scalars only supports two input arguments"); + static_assert(std::is_same_v::type>, + "f is not symmetric"); + + OptionalDeviceGuard device_guard; + opmath_arg_t scalar_val{}; + + if (iter.is_cpu_scalar(1)) { + scalar_val = iter.scalar_value(1); + iter.remove_operand(1); + + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + device_guard.reset_device(iter.device(1)); + } else if (iter.is_cpu_scalar(2)) { + scalar_val = iter.scalar_value(2); + iter.remove_operand(2); + } + + if (iter.ninputs() == 2) { + gpu_kernel(iter, BinaryFunctor(f)); + } else { + AUnaryFunctor unary_f(f, scalar_val); + gpu_kernel(iter, unary_f); + } +} + +// Legacy variant that assumes that func_t has the correct types +// that we expect to load from memory +template +void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) { + using traits = function_traits; + static_assert( + traits::arity == 2, + "gpu_kernel_with_scalars only supports two input arguments"); + using arg1_t = typename traits::template arg<0>::type; + using arg2_t = typename traits::template arg<1>::type; + using return_t = typename traits::result_type; + opmath_gpu_kernel_with_scalars(iter, f); +} + +namespace { // functions for `gpu_kernel_multiple_outputs`. + +// check the return type is `thrust::tuple`, not `std::tuple`. +template struct is_tuple: std::false_type {}; + +template struct is_tuple>: std::true_type {}; + +template +C10_LAUNCH_BOUNDS_1(num_threads()) +__global__ void unrolled_elementwise_kernel_for_multi_outputs(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) { + int remaining = N - block_work_size() * blockIdx.x; + elementwise_kernel_helper(f, memory::policies::multi_outputs_unroll(data, remaining, ic, oc)); +} + +template +static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + int64_t grid = (N + block_work_size() - 1) / block_work_size(); + auto stream = at::cuda::getCurrentCUDAStream(); + unrolled_elementwise_kernel_for_multi_outputs<<>>(N, f, data, ic, oc); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void gpu_kernel_multiple_outputs_impl(TensorIteratorBase& iter, const func_t& f) { + using traits = function_traits; + using output_t = typename traits::result_type; + static_assert(is_tuple::value, "f's return type must be `thrust::tuple`"); + constexpr int num_outputs = thrust::tuple_size::value; + constexpr int num_inputs = traits::arity; + constexpr int ntensors = num_outputs + num_inputs; + + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ntensors() == ntensors); + + std::array data; + for (int i = 0; i < ntensors; i++) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + + if (iter.is_contiguous()) { + auto input_calc = TrivialOffsetCalculator(); + auto output_calc = TrivialOffsetCalculator(); + launch_unrolled_kernel_for_multi_outputs(numel, f, data, input_calc, output_calc); + } else { + auto input_calc = make_input_offset_calculator(iter); + auto output_calc = make_output_offset_calculator(iter); + launch_unrolled_kernel_for_multi_outputs(numel, f, data, input_calc, output_calc); + } +} +} // namespace + +template +void gpu_kernel_multiple_outputs(TensorIteratorBase& iter, const func_t& f) { + ASSERT_HOST_DEVICE_LAMBDA(func_t); + + for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT(iter.device(arg).is_cuda()); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + gpu_kernel_multiple_outputs(sub_iter, f); + } + return; + } + + gpu_kernel_multiple_outputs_impl(iter, f); +} + +} //namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Math.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Math.cuh new file mode 100644 index 0000000000000000000000000000000000000000..76971eeb80c5606b733cc1d983f229fb10ef998f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Math.cuh @@ -0,0 +1,3395 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native { +// See note [Jiterator] +// TODO: elaborate in this comment on the structure of math.cuh +#if AT_USE_JITERATOR() + +const auto ndtri_string = jiterator_stringify( + /* + * This function is derived from the implementation of the digamma function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + */ + template + T polevl(const T x, const T A[], const int len) { + // NOTE: This `polevl` is different from other `polevl` + // implementation (in PyTorch) which expect the `len` to be + // `len(A) - 1` instead of `len(A)`. + T result = 0; + for (int i = 0; i < len; ++i) { + result = result * x + A[i]; + } + return result; + } + + /* + * This function is derived from the implementation of the i1e function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Computes the argument, x, for which the area under the Gaussian probability density function + * (integrated from minus infinity to x) is equal to y. + */ + template + T ndtri(T y0) { + + constexpr T zero = 0; + constexpr T one = 1; + + // Handles special cases + if (y0 == zero) { + return NEG_INFINITY; + } + if (y0 == one) { + return POS_INFINITY; + } + if (y0 < zero || y0 > one) { + return NAN; + } + + bool code = true; + T y = y0; + // Note: the constant 0.135... is equal to exp(-2) + if (y > one - T{0.13533528323661269189}) { + y = one - y; + code = false; + } + + if (y > T{0.13533528323661269189}) { + /* approximation for 0 <= |y - 0.5| <= 3/8 */ + static const T P0[5] = { + -5.99633501014107895267E1, + 9.80010754185999661536E1, + -5.66762857469070293439E1, + 1.39312609387279679503E1, + -1.23916583867381258016E0, + }; + + static const T Q0[9] = { + 1.00000000000000000000E0, + 1.95448858338141759834E0, + 4.67627912898881538453E0, + 8.63602421390890590575E1, + -2.25462687854119370527E2, + 2.00260212380060660359E2, + -8.20372256168333339912E1, + 1.59056225126211695515E1, + -1.18331621121330003142E0, + }; + + /* sqrt(2pi) */ + constexpr T s2pi = 2.50662827463100050242E0; + + y = y - T{0.5}; + const T y2 = y * y; + T x = y + y * (y2 * polevl(y2, P0, int{5}) / polevl(y2, Q0, int{9})); + return x * s2pi; + } + + T x = sqrt(T{-2.} * log(y)); + const T x0 = x - (log(x) / x); + + const T z = one / x; + T x1; + + /* y > exp(-32) = 1.2664165549e-14 */ + if (x < T{8.0}) { + /* Approximation for interval z = sqrt(-2 log y ) between 2 and 8 + * i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14. + */ + static const T P1[9] = { + 4.05544892305962419923E0, + 3.15251094599893866154E1, + 5.71628192246421288162E1, + 4.40805073893200834700E1, + 1.46849561928858024014E1, + 2.18663306850790267539E0, + -1.40256079171354495875E-1, + -3.50424626827848203418E-2, + -8.57456785154685413611E-4, + }; + + static const T Q1[9] = { + 1.00000000000000000000E0, + 1.57799883256466749731E1, + 4.53907635128879210584E1, + 4.13172038254672030440E1, + 1.50425385692907503408E1, + 2.50464946208309415979E0, + -1.42182922854787788574E-1, + -3.80806407691578277194E-2, + -9.33259480895457427372E-4, + }; + + x1 = z * polevl(z, P1, int{9}) / polevl(z, Q1, int{9}); + } else { + /* Approximation for interval z = sqrt(-2 log y ) between 8 and 64 + * i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890. + */ + static const T P2[9] = { + 3.23774891776946035970E0, + 6.91522889068984211695E0, + 3.93881025292474443415E0, + 1.33303460815807542389E0, + 2.01485389549179081538E-1, + 1.23716634817820021358E-2, + 3.01581553508235416007E-4, + 2.65806974686737550832E-6, + 6.23974539184983293730E-9, + }; + + static const T Q2[9] = { + 1.00000000000000000000E0, + 6.02427039364742014255E0, + 3.67983563856160859403E0, + 1.37702099489081330271E0, + 2.16236993594496635890E-1, + 1.34204006088543189037E-2, + 3.28014464682127739104E-4, + 2.89247864745380683936E-6, + 6.79019408009981274425E-9, + }; + + x1 = z * polevl(z, P2, int{9}) / polevl(z, Q2, int{9}); + } + + x = x0 - x1; + return (!code) ? x : -x; + } +); // ndtri_string + +const auto log_ndtr_string = jiterator_stringify( + template + T log_ndtr(T x) { + constexpr T SQRT1_2{0.707106781186547524400844362104849039}; // 1/sqrt(2) + T t = x * SQRT1_2; + if (x < T{-1.0}) { + return log(erfcx(-t) / 2) - t * t; + } else { + return log1p(-erfc(t) / 2); + } + } +); // log_ndtr_string + +const auto gcd_string = jiterator_stringify( + template + T gcd(const T a_in, const T b_in) { + T a = abs(a_in); + T b = abs(b_in); + + while (a != T{0}) { + T c = a; + a = b % a; + b = c; + } + + return b; + } +); // gcd_string + +const auto lcm_string = jiterator_stringify( + template + T gcd(const T a_in, const T b_in) { + T a = abs(a_in); + T b = abs(b_in); + + while (a != T{0}) { + T c = a; + a = b % a; + b = c; + } + + return b; + } + + template + T lcm(const T a, const T b) { + T g = gcd(a, b); + return (g == T{0}) ? T{0} : abs(a / g * b); + } +); // lcm_string + +/* + * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +// [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma +const auto digamma_string = jiterator_stringify( + template + T digamma(T x) { + static constexpr double PI_f64 = 3.14159265358979323846; + + // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard + if (x == 0) { + return copysign(POS_INFINITY, -x); + } + + T result = 0; + if (x < 0) { + // Short-circuits if x is a negative integer and returns NaN + // per the C++ standard + const bool x_is_integer = (x == trunc(x)); + if (x_is_integer) { + return NAN; + } + + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). + double q, r; + r = modf(static_cast(x), &q); + result = - PI_f64 / tan(PI_f64 * r); + x = 1 - x; + } + + while (x < T{10}) { + result -= T{1} / x; + x += T{1}; + } + + if (x == T{10}) { + return result + T{2.25175258906672110764}; + } + + T y = 0; + if (x < T{1.0e17}) { + const T A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + + T z = T{1} / (x * x); + + T polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + return log(x) - (T{0.5} / x) - y + result; + } +); // digamma_string + +/* + * This function is derived from the implementation of the zeta function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + */ +const auto zeta_string = jiterator_stringify( + template + T zeta(T x, T q) { + const T MACHEP{1.11022302462515654042E-16}; + constexpr T zero{0}; + constexpr T half{0.5}; + constexpr T one{1}; + static const T A[] = { + 12.0, + -720.0, + 30240.0, + -1209600.0, + 47900160.0, + -1.8924375803183791606e9, /*1.307674368e12/691*/ + 7.47242496e10, + -2.950130727918164224e12, /*1.067062284288e16/3617*/ + 1.1646782814350067249e14, /*5.109094217170944e18/43867*/ + -4.5979787224074726105e15, /*8.028576626982912e20/174611*/ + 1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/ + -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/ + }; + + int i = 0; + T a, b, k, s, t, w; + + // Short-circuits x -> +infty + if (x == one) { + return POS_INFINITY; + } + + // Short-circuits x < 1 -> NaN + if (x < one) { + return NAN; + } + + // Short-circuits negative q integers map to +infty, + // negative q non-integers map to NaN + if (q <= zero) { + if (q == floor(q)) { + return POS_INFINITY; + } + if (x != floor(x)) { + return NAN; + } + } + + s = pow(q, -x); + a = q; + i = 0; + b = zero; + while ((i < 9) || (a <= T{9.0})) { + i += 1; + a += one; + b = pow(a, -x); + s += b; + if ((-MACHEP * s < b) && (b < MACHEP * s)) { + return s; + } + }; + + w = a; + s += b * w / (x - one); + s -= half * b; + a = one; + k = zero; + for (int i = 0; i < 12; i++) { + a *= x + k; + b /= w; + t = a * b / A[i]; + s = s + t; + t = fabs(t / s); + + if (t < MACHEP) { + return s; + } + + k += one; + a *= x + k; + b /= w; + k += one; + } + + return s; + } +); // zeta_string + +const auto trigamma_string = jiterator_stringify( + template + T trigamma(T x) { + const T PI{3.14159265358979323846}; + T sign = 1; + T result = 0; + + if (x < T{0.5}) { + sign = -1; + T sin_pi_x = sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + + for (int i = 0; i < 6; ++i) { + result += T{1} / (x * x); + x += 1; + } + + const T one{1}; + const T ixx = one / (x*x); + result += (one + one / (T{2}*x) + ixx * (one/T{6} - ixx * (one/T{30} - ixx * (one/T{42})))) / x; + return sign * result; +} +); // trigamma_string + +const auto lgamma_string = jiterator_stringify( + template + T lgamma_kernel(T a) { + return lgamma(a); + } +); // lgamma_string + +const auto polygamma_string = zeta_string + jiterator_stringify( + template + T polygamma(T x, int n) { + // already blocked if n <= 1 + const auto one = T{1}; + return ((n % 2) ? one : -one) * exp(lgamma(static_cast(n) + one)) * + zeta(static_cast(n + 1), x); + } +); // polygamma_string + +const auto exp2_string = jiterator_stringify( + template + T exp2_impl(T a) { + return exp2(a); + } + + namespace std { template class complex; } + template + std::complex exp2_impl(std::complex x) { + // There is no std::exp2 overload for complex, so instead + // use the identity 2^x = e^(ln(2) * x) + const auto ln_2 = static_cast(0.693147180559945309417232121458176); + return exp(ln_2 * x); + } + + template + T exp2_kernel(T a) { + return exp2_impl(a); + } +); // exp2_string + +const auto erfc_string = jiterator_stringify( + template + T erfc_kernel(T a) { + return erfc(a); + } +); // erfc_string + +const auto erfinv_string = jiterator_stringify( + template + T erfinv_kernel(T a) { + return erfinv(a); + } +); // erfinv_string + +const auto entr_string = jiterator_stringify( + template + T entr(T a) { + if (a != a) { + return a; + } + + if (a > 0) { + return -a * log(a); + } + + if (a == 0) { + return 0; + } + + return NEG_INFINITY; + } +); // entr_string + +// NOTE: `kaiser_window_string` depends on `i0_string` +// for its implementation. +const auto i0_string = jiterator_stringify( + template + T chbevl(T x, const T array[], const int len) { + + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + T i0(T _x) { + T x = fabs(_x); + + if (x <= T{8.0}) { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T A[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + T y = (x / T{2.0}) - T{2.0}; + return exp(x) * chbevl(y, A, int{30}); + } + + // Handles x > 8 case + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). + */ + const T B[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return (exp(x) * chbevl(T{32.0} / x - T{2.0}, B, int{25})) / sqrt(x); + } +); // i0_string + +const auto i1_string = jiterator_stringify( + template + T chbevl(const T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + T i1(T _x) { + const T x = fabs(_x); + + if (x <= T{8.0}) { + // Chebyshev coefficients for exp(-x) i1(x) in the internal [0, 8] + // lim(x->0){ exp(-x) i1(x) / x } = 1/2 + static const T coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const T y = x / T{2.0} - T{2.0}; + const T out = exp(x) * x * chbevl(y, coefficients, int{29}); + return (_x < T{0.0}) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval [8, infinity] + // lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi) + static const T coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + const T out = (exp(x) * chbevl(T{32.} / x - T{2.}, coefficients, int{25})) / sqrt(x); + return (_x < T{0.}) ? -out : out; + } +); // i1_string + +const auto i1e_string = jiterator_stringify( + template + T chbevl(const T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + // See double and float instantiations below + template + T i1e(T _x) { } + + // Double specialization (uses different coefficients than the float version) + template<> + double i1e(double _x) { + const double x = fabs(_x); + if (x <= double{8.}) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. + static const double coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const double y = x / double{2.} - double{2.}; + const double out = chbevl(y, coefficients, int{29}) * x; + return (_x < 0.) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? + static const double coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + const double out = chbevl(double{32.} / x - double{2.}, coefficients, int{25}) / sqrt(x); + return (_x < double{0.}) ? -out : out; + } + + // Float specialization (uses different coefficients than the double version) + template<> + float i1e(float _x) { + const float x = fabsf(_x); + if (x <= float{8.}) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. + static const float coefficients[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + const float y = x / float{2.} - float{2.}; + const float out = chbevl(y, coefficients, int{17}) * x; + return (_x < 0.) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? + static const float coefficients[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + const float out = chbevl(float{32.} / x - float{2.}, coefficients, int{7}) / sqrt(x); + return (_x < float{0.}) ? -out : out; + } +); // i1e_string + +const auto kaiser_window_string = i0_string + jiterator_stringify( + template + T kaiser_window(T a, T inv_alpha, T beta, T inv_i0_beta) { + T x = a * inv_alpha - T{1}; + T y = max(T{0}, T{1} - x * x); + return i0(beta * sqrt(y)) * inv_i0_beta; + } +); // kaiser_window_string + +const auto sinc_string = jiterator_stringify( + template + T sinc(T a) { + if (a == T(0)) { + return T(1); + } + constexpr T pi = T(3.14159265358979323846L); + T product = pi * a; + return std::sin(product) / product; + } +); // sinc_string + +const auto erfcx_string = jiterator_stringify( + /* The next function is taken from http://ab-initio.mit.edu/faddeeva */ + + /* Copyright (c) 2012 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + /* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by + Steven G. Johnson, October 2012. + + This function combines a few different ideas. + + First, for x > 50, it uses a continued-fraction expansion (same as + for the Faddeeva function, but with algebraic simplifications for z=i*x). + + Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations, + but with two twists: + + a) It maps x to y = 4 / (4+x) in [0,1]. This simple transformation, + inspired by a similar transformation in the octave-forge/specfun + erfcx by Soren Hauberg, results in much faster Chebyshev convergence + than other simple transformations I have examined. + + b) Instead of using a single Chebyshev polynomial for the entire + [0,1] y interval, we break the interval up into 100 equal + subintervals, with a switch/lookup table, and use much lower + degree Chebyshev polynomials in each subinterval. This greatly + improves performance in my tests. + + For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfc(x), + with the usual checks for overflow etcetera. + + Performance-wise, it seems to be substantially faster than either + the SLATEC DERFC function [or an erfcx function derived there from] + or Cody's CALERF function (from netlib.org/specfun), while + retaining near machine precision in accuracy. + */ + + /* Given y100 = 100 * y, where y = 4 / (4 + x) for x >= 0, compute erfc(x). + + Uses a look-up table of 100 different Chebyshev polynomials + for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated + with the help of Maple and a little shell script. This allows + the Chebyshev polynomials to be of significantly lower degree (about 1/4) + compared to fitting the whole [0,1] interval with a single polynomial. + */ + + // TODO: review if this is computing in double when given a float input + template + T erfcx_y100(T y100) { + switch (static_cast(y100)) { + case 0: { + T t = 2*y100 - 1; + return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 1: { + T t = 2*y100 - 3; + return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 2: { + T t = 2*y100 - 5; + return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 3: { + T t = 2*y100 - 7; + return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t; + } + case 4: { + T t = 2*y100 - 9; + return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 5: { + T t = 2*y100 - 11; + return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 6: { + T t = 2*y100 - 13; + return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 7: { + T t = 2*y100 - 15; + return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t; + } + case 8: { + T t = 2*y100 - 17; + return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 9: { + T t = 2*y100 - 19; + return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 10: { + T t = 2*y100 - 21; + return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 11: { + T t = 2*y100 - 23; + return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 12: { + T t = 2*y100 - 25; + return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 13: { + T t = 2*y100 - 27; + return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 14: { + T t = 2*y100 - 29; + return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 15: { + T t = 2*y100 - 31; + return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 16: { + T t = 2*y100 - 33; + return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 17: { + T t = 2*y100 - 35; + return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 18: { + T t = 2*y100 - 37; + return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t; + } + case 19: { + T t = 2*y100 - 39; + return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 20: { + T t = 2*y100 - 41; + return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 21: { + T t = 2*y100 - 43; + return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 22: { + T t = 2*y100 - 45; + return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 23: { + T t = 2*y100 - 47; + return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 24: { + T t = 2*y100 - 49; + return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 25: { + T t = 2*y100 - 51; + return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 26: { + T t = 2*y100 - 53; + return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 27: { + T t = 2*y100 - 55; + return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 28: { + T t = 2*y100 - 57; + return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 29: { + T t = 2*y100 - 59; + return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 30: { + T t = 2*y100 - 61; + return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 31: { + T t = 2*y100 - 63; + return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 32: { + T t = 2*y100 - 65; + return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 33: { + T t = 2*y100 - 67; + return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 34: { + T t = 2*y100 - 69; + return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t; + } + case 35: { + T t = 2*y100 - 71; + return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 36: { + T t = 2*y100 - 73; + return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 37: { + T t = 2*y100 - 75; + return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 38: { + T t = 2*y100 - 77; + return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 39: { + T t = 2*y100 - 79; + return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 40: { + T t = 2*y100 - 81; + return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 41: { + T t = 2*y100 - 83; + return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 42: { + T t = 2*y100 - 85; + return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 43: { + T t = 2*y100 - 87; + return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 44: { + T t = 2*y100 - 89; + return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 45: { + T t = 2*y100 - 91; + return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 46: { + T t = 2*y100 - 93; + return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 47: { + T t = 2*y100 - 95; + return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 48: { + T t = 2*y100 - 97; + return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 49: { + T t = 2*y100 - 99; + return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 50: { + T t = 2*y100 - 101; + return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t; + } + case 51: { + T t = 2*y100 - 103; + return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 52: { + T t = 2*y100 - 105; + return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 53: { + T t = 2*y100 - 107; + return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t; + } + case 54: { + T t = 2*y100 - 109; + return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 55: { + T t = 2*y100 - 111; + return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 56: { + T t = 2*y100 - 113; + return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 57: { + T t = 2*y100 - 115; + return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 58: { + T t = 2*y100 - 117; + return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 59: { + T t = 2*y100 - 119; + return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 60: { + T t = 2*y100 - 121; + return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 61: { + T t = 2*y100 - 123; + return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 62: { + T t = 2*y100 - 125; + return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 63: { + T t = 2*y100 - 127; + return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 64: { + T t = 2*y100 - 129; + return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 65: { + T t = 2*y100 - 131; + return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 66: { + T t = 2*y100 - 133; + return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 67: { + T t = 2*y100 - 135; + return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 68: { + T t = 2*y100 - 137; + return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 69: { + T t = 2*y100 - 139; + return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 70: { + T t = 2*y100 - 141; + return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 71: { + T t = 2*y100 - 143; + return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 72: { + T t = 2*y100 - 145; + return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 73: { + T t = 2*y100 - 147; + return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t; + } + case 74: { + T t = 2*y100 - 149; + return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 75: { + T t = 2*y100 - 151; + return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 76: { + T t = 2*y100 - 153; + return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 77: { + T t = 2*y100 - 155; + return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 78: { + T t = 2*y100 - 157; + return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 79: { + T t = 2*y100 - 159; + return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 80: { + T t = 2*y100 - 161; + return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 81: { + T t = 2*y100 - 163; + return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 82: { + T t = 2*y100 - 165; + return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 83: { + T t = 2*y100 - 167; + return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 84: { + T t = 2*y100 - 169; + return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 85: { + T t = 2*y100 - 171; + return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 86: { + T t = 2*y100 - 173; + return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 87: { + T t = 2*y100 - 175; + return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 88: { + T t = 2*y100 - 177; + return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 89: { + T t = 2*y100 - 179; + return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 90: { + T t = 2*y100 - 181; + return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 91: { + T t = 2*y100 - 183; + return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 92: { + T t = 2*y100 - 185; + return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 93: { + T t = 2*y100 - 187; + return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 94: { + T t = 2*y100 - 189; + return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 95: { + T t = 2*y100 - 191; + return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 96: { + T t = 2*y100 - 193; + return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 97: { + T t = 2*y100 - 195; + return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t; + } + case 98: { + T t = 2*y100 - 197; + return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 99: { + T t = 2*y100 - 199; + return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t; + } + } + + // we only get here if y = 1, i.e. |x| < 4*eps, in which case + // erfcx is within 1e-15 of 1.. + return 1.; + } + + template + T erfcx(T x) { + // Short-circuits on NaN (returning NaN) + if (x != x) { + return x; + } + + if (x >= 0) { + if (x > T{50}) { // continued-fraction expansion is faster + const T ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi) + + if (x > T{5e7}) { // 1-term expansion, important to avoid overflow + return ispi / x; + } + + /* 5-term expansion (rely on compiler for CSE), simplified from: + ispi / (x+0.5/(x+1/(x+1.5/(x+2/x)))) */ + return ispi * ((x*x) * (x*x+T{4.5}) + T{2}) / (x * ((x*x) * (x*x+T{5}) + T{3.75})); + } + + // x >= 0 x <= 50 + return erfcx_y100(T{400} / (T{4} + x)); + } + + // x < 0 + if (x < T{-26.7}) { + return POS_INFINITY; + } else if (x < T{-6.1}) { + return T{2} * exp(x * x); + } + + // x < 0 and x >= -6.1 + return T{2} * exp(x * x) - erfcx_y100(T{400} / (T{4} - x)); + } +); // erfcx_string + +const auto airy_ai_string = jiterator_stringify( + template + T airy_ai_forward(T x) { + static const T AN[] = { + +3.46538101525629032477e-01, + +1.20075952739645805542e+01, + +7.62796053615234516538e+01, + +1.68089224934630576269e+02, + +1.59756391350164413639e+02, + +7.05360906840444183113e+01, + +1.40264691163389668864e+01, + +9.99999999999999995305e-01, + }; + + static const T AD[] = { + +5.67594532638770212846e-01, + +1.47562562584847203173e+01, + +8.45138970141474626562e+01, + +1.77318088145400459522e+02, + +1.64234692871529701831e+02, + +7.14778400825575695274e+01, + +1.40959135607834029598e+01, + +1.00000000000000000470e+00, + }; + + static const T AFN[] = { + -1.31696323418331795333e-01, + -6.26456544431912369773e-01, + -6.93158036036933542233e-01, + -2.79779981545119124951e-01, + -4.91900132609500318020e-02, + -4.06265923594885404393e-03, + -1.59276496239262096340e-04, + -2.77649108155232920844e-06, + -1.67787698489114633780e-08, + }; + + static const T AFD[] = { + +1.33560420706553243746e+01, + +3.26825032795224613948e+01, + +2.67367040941499554804e+01, + +9.18707402907259625840e+00, + +1.47529146771666414581e+00, + +1.15687173795188044134e-01, + +4.40291641615211203805e-03, + +7.54720348287414296618e-05, + +4.51850092970580378464e-07, + }; + + static const T AGN[] = { + +1.97339932091685679179e-02, + +3.91103029615688277255e-01, + +1.06579897599595591108e+00, + +9.39169229816650230044e-01, + +3.51465656105547619242e-01, + +6.33888919628925490927e-02, + +5.85804113048388458567e-03, + +2.82851600836737019778e-04, + +6.98793669997260967291e-06, + +8.11789239554389293311e-08, + +3.41551784765923618484e-10, + }; + + static const T AGD[] = { + +9.30892908077441974853e+00, + +1.98352928718312140417e+01, + +1.55646628932864612953e+01, + +5.47686069422975497931e+00, + +9.54293611618961883998e-01, + +8.64580826352392193095e-02, + +4.12656523824222607191e-03, + +1.01259085116509135510e-04, + +1.17166733214413521882e-06, + +4.91834570062930015649e-09, + }; + + int domain_flag = 0; + + T ai; + + if (isinf(x)) { + return NAN; + } + + if (x > T(103.892)) { + return T(0.0); + } + + T f; + T g; + T k; + + if (x < T(-2.09)) { + T z = T(1.0) / (T(-2.0) * x * sqrt(-x) / T(3.0)); + + T afn = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afn = afn * (z * z) + AFN[index]; + } + + T afd = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afd = afd * (z * z) + AFD[index]; + } + + T agn = 0.0; + + for (uint8_t index = 0; index <= 10 + 0; index++) { + agn = agn * (z * z) + AGN[index]; + } + + T agd = 0.0; + + for (uint8_t index = 0; index <= 10 - 1; index++) { + agd = agd * (z * z) + AGD[index]; + } + + T t = T(-2.0) * x * sqrt(-x) / T(3.0) + T(0.25) * T(3.14159265358979323846); + + return T(5.64189583547756286948e-01) / sqrt(sqrt(-x)) * (sin(t) * (T(1.0) + z * z * afn / afd) - cos(t) * (z * agn / agd)); + } + + if (x >= T(2.09)) { + domain_flag = 5; + + T zeta = T(2.0) * x * sqrt(x) / T(3.0); + + T an = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + an = an * (T(1.0) / zeta) + AN[index]; + } + + T ad = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + ad = ad * (T(1.0) / zeta) + AD[index]; + } + + ai = T(5.64189583547756286948e-01) * (an / ad) / (T(2.0) * sqrt(sqrt(x)) * exp(zeta)); + + if (x > T(8.3203353)) { + return ai; + } + } + + f = 1.0; + g = x; + k = 1.0; + + T m = 1.0; + T n = x; + T t = 1.0; + T z = x * x * x; + + while (t > T(1.11022302462515654042e-16)) { + m *= z; + k += T(1.0); + m /= k; + n *= z; + k += T(1.0); + n /= k; + m /= k; + f += m; + k += T(1.0); + n /= k; + g += n; + + t = abs(m / f); + } + + if ((domain_flag & 1) == 0) { + return T(0.355028053887817239260) * f - T(0.258819403792806798405) * g; + } + + return ai; + } // T airy_ai(T x) +); // airy_ai_string + +const auto bessel_j0_string = jiterator_stringify( + template + T bessel_j0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T RP[] = { + -4.79443220978201773821e+09, + +1.95617491946556577543e+12, + -2.49248344360967716204e+14, + +9.70862251047306323952e+15, + }; + + static const T RQ[] = { + +4.99563147152651017219e+02, + +1.73785401676374683123e+05, + +4.84409658339962045305e+07, + +1.11855537045356834862e+10, + +2.11277520115489217587e+12, + +3.10518229857422583814e+14, + +3.18121955943204943306e+16, + +1.71086294081043136091e+18, + }; + + if (x < T(0)) { + x = -x; + } + + if (x <= T(5.0)) { + if (x < T(0.00001)) { + return T(1.0) - x * x / T(4.0); + } + + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return (x * x - T(5.78318596294678452118e+00)) * (x * x - T(3.04712623436620863991e+01)) * rp / rq; + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * cos(x - T(0.785398163397448309615660845819875721)) - T(5.0) / x * (qp / qq) * sin(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_j0_forward(T x) +); // bessel_j0_string + +const auto bessel_y0_string = bessel_j0_string + jiterator_stringify( + template + T bessel_y0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T YP[] = { + +1.55924367855235737965e+04, + -1.46639295903971606143e+07, + +5.43526477051876500413e+09, + -9.82136065717911466409e+11, + +8.75906394395366999549e+13, + -3.46628303384729719441e+15, + +4.42733268572569800351e+16, + -1.84950800436986690637e+16, + }; + + static const T YQ[] = { + +1.04128353664259848412e+03, + +6.26107330137134956842e+05, + +2.68919633393814121987e+08, + +8.64002487103935000337e+10, + +2.02979612750105546709e+13, + +3.17157752842975028269e+15, + +2.50596256172653059228e+17, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return NEG_INFINITY; + } + + if (x < T(0.0)) { + NAN; + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return yp / yq + (T(0.636619772367581343075535053490057448) * log(x) * bessel_j0_forward(x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * sin(x - T(0.785398163397448309615660845819875721)) + T(5.0) / x * (qp / qq) * cos(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_y0_forward(T x) +); // bessel_y0_string + +const auto bessel_j1_string = jiterator_stringify( + template + T bessel_j1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T RP[] = { + -8.99971225705559398224e+08, + +4.52228297998194034323e+11, + -7.27494245221818276015e+13, + +3.68295732863852883286e+15, + }; + + static const T RQ[] = { + +6.20836478118054335476e+02, + +2.56987256757748830383e+05, + +8.35146791431949253037e+07, + +2.21511595479792499675e+10, + +4.74914122079991414898e+12, + +7.84369607876235854894e+14, + +8.95222336184627338078e+16, + +5.32278620332680085395e+18, + }; + + if (x < T(0.0)) { + return -bessel_j1_forward(-x); + } + + if (x <= T(5.0)) { + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return rp / rq * x * (x * x - T(1.46819706421238932572e+01)) * (x * x - T(4.92184563216946036703e+01)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * cos(x - T(2.356194490192344928846982537459627163)) - T(5.0) / x * (qp / qq) * sin(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_j1_forward(T x) +); // bessel_j1_string + +const auto bessel_y1_string = bessel_j1_string + jiterator_stringify( + template + T bessel_y1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T YP[] = { + +1.26320474790178026440e+09, + -6.47355876379160291031e+11, + +1.14509511541823727583e+14, + -8.12770255501325109621e+15, + +2.02439475713594898196e+17, + -7.78877196265950026825e+17, + }; + + static const T YQ[] = { + +5.94301592346128195359e+02, + +2.35564092943068577943e+05, + +7.34811944459721705660e+07, + +1.87601316108706159478e+10, + +3.88231277496238566008e+12, + +6.20557727146953693363e+14, + +6.87141087355300489866e+16, + +3.97270608116560655612e+18, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return NEG_INFINITY; + } + + if (x <= T(0.0)) { + return NAN; + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 5; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return x * (yp / yq) + (T(0.636619772367581343075535053490057448) * (bessel_j1_forward(x) * log(x) - T(1.0) / x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * sin(x - T(2.356194490192344928846982537459627163)) + T(5.0) / x * (qp / qq) * cos(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_y1_forward(T x) +); // bessel_y1_string + +const auto chebyshev_polynomial_t_string = jiterator_stringify( + template + T chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (abs(x) < T(1.0))) { + return cos(n * acos(x)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_t_forward(T x, int64_t n) + + template + T chebyshev_polynomial_t_forward(T x, T n) { + return chebyshev_polynomial_t_forward(x, static_cast(n)); + } // chebyshev_polynomial_t_forward(T x, T n) +); // chebyshev_polynomial_t_string + +const auto chebyshev_polynomial_u_string = jiterator_stringify( + template + T chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (sin(acos(x)) != T(0.0)) { + return sin((n + 1) * acos(x)) / sin(acos(x)); + } + + return (n + 1) * cos((n + 1) * acos(x)) / x; + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + T p = T(1.0); + T q = x + x; + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_u_forward(T x, int64_t n) + + template + T chebyshev_polynomial_u_forward(T x, T n) { + return chebyshev_polynomial_u_forward(x, static_cast(n)); + } // chebyshev_polynomial_u_forward(T x, T n) +); // chebyshev_polynomial_u_string + +const auto chebyshev_polynomial_v_string = jiterator_stringify( + template + T chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0)) { + return T(1.0); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (sin(acos(x) / T(2.0)) != T(1.0)) { + return cos((n + T(0.5)) * acos(x)) / cos(acos(x) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_v_forward(T x, int64_t n) + + template + T chebyshev_polynomial_v_forward(T x, T n) { + return chebyshev_polynomial_v_forward(x, static_cast(n)); + } // chebyshev_polynomial_v_forward(T x, T n) +); // chebyshev_polynomial_v_string + +const auto chebyshev_polynomial_w_string = jiterator_stringify( + template + T chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (cos(acos(x) / T(2.0)) != T(1.0)) { + return sin((n + T(0.5)) * acos(x)) / sin(acos(x) / T(2.0)); + } + + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x + T(1.0); + } + + T p = T(1.0); + T q = x + x + T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_w_forward(T x, int64_t n) + + template + T chebyshev_polynomial_w_forward(T x, T n) { + return chebyshev_polynomial_w_forward(x, static_cast(n)); + } // chebyshev_polynomial_w_forward(T x, T n) +); // chebyshev_polynomial_w_string + +const auto hermite_polynomial_h_string = jiterator_stringify( + template + unsigned short getHermitianLimit() { + if (sizeof(T) <= sizeof(float)) { + return 128; + } else if (sizeof(T) <= sizeof(double)) { + return 512; + } else { + return 1024; + } + } + + template + T hermite_polynomial_h_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + if (n > getHermitianLimit()) { + return NAN; + } + + T p = T(1.0); + T q = x + x; + T r = T(0.0); + + for (int64_t k = 2; k < n + n; k += 2) { + r = (x + x) * q - k * p; + p = q; + q = r; + } + + return r; + } // hermite_polynomial_h_forward(T x, int64_t n) + + template + T hermite_polynomial_h_forward(T x, T n) { + return hermite_polynomial_h_forward(x, static_cast(n)); + } // hermite_polynomial_h_forward(T x, T n) +); // hermite_polynomial_h_string + +const auto hermite_polynomial_he_string = jiterator_stringify( + template + unsigned short getHermitianLimit() { + if (sizeof(T) <= sizeof(float)) { + return 128; + } else if (sizeof(T) <= sizeof(double)) { + return 512; + } else { + return 1024; + } + } + + template + T hermite_polynomial_he_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + if (n > getHermitianLimit()) { + return NAN; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = x * q - k * p; + p = q; + q = r; + } + + return r; + } // hermite_polynomial_he_forward(T x, int64_t n) + + template + T hermite_polynomial_he_forward(T x, T n) { + return hermite_polynomial_he_forward(x, static_cast(n)); + } // hermite_polynomial_he_forward(T x, T n) +); // hermite_polynomial_he_string + +const auto laguerre_polynomial_l_string = jiterator_stringify( + template + T laguerre_polynomial_l_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(0.0)) { + return T(1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return T(1.0) - x; + } + + T p = T(1.0); + T q = T(1.0) - x; + T r; + + for (int64_t k = 1; (k < n) && !isnan(q); k++) { + r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; + } // laguerre_polynomial_l_forward(T x, int64_t n) + + template + T laguerre_polynomial_l_forward(T x, T n) { + return laguerre_polynomial_l_forward(x, static_cast(n)); + } // laguerre_polynomial_l_forward(T x, T n) +); // laguerre_polynomial_l_string + +const auto legendre_polynomial_p_string = jiterator_stringify( + template + T legendre_polynomial_p_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; (k < n) && !isnan(q); k++) { + r = ((k + k + 1) * x * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; + } // legendre_polynomial_p_forward(T x, int64_t n) + + template + T legendre_polynomial_p_forward(T x, T n) { + return legendre_polynomial_p_forward(x, static_cast(n)); + } // legendre_polynomial_p_forward(T x, T n) +); // legendre_polynomial_p_string + +const auto modified_bessel_i0_string = jiterator_stringify( + template + T modified_bessel_i0_forward(T x) { + static const T A[] = { + -4.41534164647933937950e-18, + +3.33079451882223809783e-17, + -2.43127984654795469359e-16, + +1.71539128555513303061e-15, + -1.16853328779934516808e-14, + +7.67618549860493561688e-14, + -4.85644678311192946090e-13, + +2.95505266312963983461e-12, + -1.72682629144155570723e-11, + +9.67580903537323691224e-11, + -5.18979560163526290666e-10, + +2.65982372468238665035e-09, + -1.30002500998624804212e-08, + +6.04699502254191894932e-08, + -2.67079385394061173391e-07, + +1.11738753912010371815e-06, + -4.41673835845875056359e-06, + +1.64484480707288970893e-05, + -5.75419501008210370398e-05, + +1.88502885095841655729e-04, + -5.76375574538582365885e-04, + +1.63947561694133579842e-03, + -4.32430999505057594430e-03, + +1.05464603945949983183e-02, + -2.37374148058994688156e-02, + +4.93052842396707084878e-02, + -9.49010970480476444210e-02, + +1.71620901522208775349e-01, + -3.04682672343198398683e-01, + +6.76795274409476084995e-01, + }; + + static const T B[] = { + -7.23318048787475395456e-18, + -4.83050448594418207126e-18, + +4.46562142029675999901e-17, + +3.46122286769746109310e-17, + -2.82762398051658348494e-16, + -3.42548561967721913462e-16, + +1.77256013305652638360e-15, + +3.81168066935262242075e-15, + -9.55484669882830764870e-15, + -4.15056934728722208663e-14, + +1.54008621752140982691e-14, + +3.85277838274214270114e-13, + +7.18012445138366623367e-13, + -1.79417853150680611778e-12, + -1.32158118404477131188e-11, + -3.14991652796324136454e-11, + +1.18891471078464383424e-11, + +4.94060238822496958910e-10, + +3.39623202570838634515e-09, + +2.26666899049817806459e-08, + +2.04891858946906374183e-07, + +2.89137052083475648297e-06, + +6.88975834691682398426e-05, + +3.36911647825569408990e-03, + +8.04490411014108831608e-01, + }; + + T p; + T q = 0.0; + + if (abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 30; index++) { + p = q; + q = a; + a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + return exp(abs(x)) * (T(0.5) * (a - p)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index]; + } + + return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)); + } // modified_bessel_i0_forward(T x) +); // modified_bessel_i0_string + +const auto modified_bessel_i1_string = jiterator_stringify( + template + T modified_bessel_i1_forward(T x) { + static const T A[] = { + +2.77791411276104639959e-18, + -2.11142121435816608115e-17, + +1.55363195773620046921e-16, + -1.10559694773538630805e-15, + +7.60068429473540693410e-15, + -5.04218550472791168711e-14, + +3.22379336594557470981e-13, + -1.98397439776494371520e-12, + +1.17361862988909016308e-11, + -6.66348972350202774223e-11, + +3.62559028155211703701e-10, + -1.88724975172282928790e-09, + +9.38153738649577178388e-09, + -4.44505912879632808065e-08, + +2.00329475355213526229e-07, + -8.56872026469545474066e-07, + +3.47025130813767847674e-06, + -1.32731636560394358279e-05, + +4.78156510755005422638e-05, + -1.61760815825896745588e-04, + +5.12285956168575772895e-04, + -1.51357245063125314899e-03, + +4.15642294431288815669e-03, + -1.05640848946261981558e-02, + +2.47264490306265168283e-02, + -5.29459812080949914269e-02, + +1.02643658689847095384e-01, + -1.76416518357834055153e-01, + +2.52587186443633654823e-01, + }; + + static const T B[] = { + +7.51729631084210481353e-18, + +4.41434832307170791151e-18, + -4.65030536848935832153e-17, + -3.20952592199342395980e-17, + +2.96262899764595013876e-16, + +3.30820231092092828324e-16, + -1.88035477551078244854e-15, + -3.81440307243700780478e-15, + +1.04202769841288027642e-14, + +4.27244001671195135429e-14, + -2.10154184277266431302e-14, + -4.08355111109219731823e-13, + -7.19855177624590851209e-13, + +2.03562854414708950722e-12, + +1.41258074366137813316e-11, + +3.25260358301548823856e-11, + -1.89749581235054123450e-11, + -5.58974346219658380687e-10, + -3.83538038596423702205e-09, + -2.63146884688951950684e-08, + -2.51223623787020892529e-07, + -3.88256480887769039346e-06, + -1.10588938762623716291e-04, + -9.76109749136146840777e-03, + +7.78576235018280120474e-01, + }; + + T p; + T q = 0.0; + + if (abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 29; index++) { + p = q; + q = a; + a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + if (x < T(0.0)) { + return -(T(0.5) * (a - p) * abs(x) * exp(abs(x))); + } + + return T(0.5) * (a - p) * abs(x) * exp(abs(x)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index]; + } + + if (x < T(0.0)) { + return -(exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x))); + } + + return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)); + } // modified_bessel_i1_forward(T x) +); // modified_bessel_i1_string + +const auto modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify( + template + T modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return T(0.5) * (a - p) - log(0.5 * x) * modified_bessel_i0_forward(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return exp(-x) * (T(0.5) * (b - p)) / sqrt(x); + } // modified_bessel_k0_forward(T x) +); // modified_bessel_k0_string + +const auto scaled_modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify( + template + T scaled_modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (T(0.5) * (a - p) - log(T(0.5) * x) * modified_bessel_i0_forward(x)) * exp(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return T(0.5) * (b - p) / sqrt(x); + } // T scaled_modified_bessel_k0_forward(T x) +); // scaled_modified_bessel_k0_string + +const auto modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify( + template + T modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x; + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return exp(-x) * (T(0.5) * (b - p)) / sqrt(x); + } // modified_bessel_k1_forward(T x) +); // modified_bessel_k1_string + +const auto scaled_modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify( + template + T scaled_modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x) * exp(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return (T(0.5) * (b - p) / sqrt(x)); + } // T scaled_modified_bessel_k1_forward(T x) +); // scaled_modified_bessel_k1_string + +const auto shifted_chebyshev_polynomial_t_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + return cos(n * acos(x + x - T(1.0))); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_t_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_t_forward(T x, T n) { + return shifted_chebyshev_polynomial_t_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_t_forward(T x, T n) +); // shifted_chebyshev_polynomial_t_string + +const auto shifted_chebyshev_polynomial_u_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + if (sin(acos(x + x - T(1.0))) != T(0.0)) { + return sin((n + 1) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0))); + } + + return (n + 1) * cos((n + 1) * acos(x + x - T(1.0))) / (x + x - T(1.0)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_u_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_u_forward(T x, T n) { + return shifted_chebyshev_polynomial_u_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_u_forward(T x, T n) +); // shifted_chebyshev_polynomial_u_string + +const auto shifted_chebyshev_polynomial_v_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return (n + n + 1); + } + + return -(n + n + 1); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + if (sin(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return cos(((n) + T(0.5)) * acos(x + x - T(1.0))) / cos(acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_v_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_v_forward(T x, T n) { + return shifted_chebyshev_polynomial_v_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_v_forward(T x, T n) +); // shifted_chebyshev_polynomial_v_string + +const auto shifted_chebyshev_polynomial_w_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 4) && (abs(x + x - T(1.0)) < T(1.0))) { + if (cos(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return sin((n + T(0.5)) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_w_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_w_forward(T x, T n) { + return shifted_chebyshev_polynomial_w_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_w_forward(T x, T n) +); // shifted_chebyshev_polynomial_w_string + +const auto spherical_bessel_j0_string = jiterator_stringify( + template + T spherical_bessel_j0_forward(T x) { + if (isinf(x)) { + return T(0.0); + } + + if (abs(x) < T(0.5)) { + return T(1.0) + x * x * (T(-1.0) / T(6.0) + x * x * (T(1.0) / T(120.0) + x * x * (T(-1.0) / T(5040.0) + x * x * (T(1.0) / T(362880.0) + x * x * (T(-1.0) / T(39916800.0) + x * x * (T(1.0) / T(6227020800.0))))))); + } + + return sin(x) / x; + } // T spherical_bessel_j0_forward(T x) +); // spherical_bessel_j0_string + +#else // !AT_USE_JITERATOR() -- kernels must be precompiled + +template +static inline C10_HOST_DEVICE scalar_t calc_gcd(scalar_t a_in, scalar_t b_in) { + scalar_t a = ::abs(a_in); + scalar_t b = ::abs(b_in); + while (a != 0) { + scalar_t c = a; + a = b % a; + b = c; + } + return b; +} + +/* + * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { + // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma + using accscalar_t = at::acc_type; + static constexpr double PI_f64 = 3.14159265358979323846; + constexpr accscalar_t PSI_10 = 2.25175258906672110764; + constexpr accscalar_t A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + accscalar_t x = static_cast(in); + if (x == 0) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is ±0, ±∞ is returned + return std::copysign(static_cast(INFINITY), -x); + } + + bool x_is_integer = x == ::trunc(x); + accscalar_t result = 0; + if (x < 0) { + if (x_is_integer) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is a negative integer, NaN is returned + return static_cast(NAN); + } + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). + double q, r; + r = ::modf(static_cast(x), &q); + result = static_cast(- PI_f64 / ::tan(PI_f64 * r)); + x = 1 - x; + } + + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return static_cast(result + PSI_10); + } + + accscalar_t y = 0; + if (x < 1.0e17) { + accscalar_t z = 1 / (x * x); + + accscalar_t polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + return static_cast(::log(x) - (static_cast(0.5) / x) - y + result); +} + +template +static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) { + using accscalar_t = at::acc_type; + const accscalar_t PI = 3.14159265358979323846; + accscalar_t x = static_cast(in); + accscalar_t sign = +1; + accscalar_t result = 0; + if (x < 0.5f) { + sign = -1; + accscalar_t sin_pi_x = ::sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const accscalar_t one = static_cast(1); + const accscalar_t ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (one/6 - ixx * (one/30 - ixx * (one/42)))) / x; + return static_cast(sign * result); +} + +/* + * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +static inline C10_HOST_DEVICE scalar_t +chbevl(scalar_t _x, const scalar_t array[], size_t len) { + static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + + scalar_t b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (size_t i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = _x * b1 - b2 + array[i]; + } + + return (0.5 * (b0 - b2)); +} + +/* + * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_A() { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T coefficients[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + return std::make_tuple(coefficients, 30); +} + +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). + */ + static const T coefficients[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return std::make_tuple(coefficients, 25); +} + +template +static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) { + static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + // Upcast input for numerical accuracy purposes + // Needed for accurate results if input is bfloat16 or float16 + scalar_t x = ::abs(_x); + + if (x <= scalar_t{8.0}) { + auto [A, len] = chebyshev_coefficients_i0e_A(); + scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0}; + return (::exp(x) * chbevl(y, A, len)); + } + + auto [B, len] = chebyshev_coefficients_i0e_B(); + return (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x)); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if_t, std::tuple> + chebyshev_coefficients_i1e_A() { + /* Chebyshev coefficients for exp(-x) I1(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I1(x) / x } = 1/2. + */ + static const T coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + + return std::make_tuple(coefficients, 29); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if_t, std::tuple> + chebyshev_coefficients_i1e_A() { + /* Chebyshev coefficients for exp(-x) I1(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I1(x) / x } = 1/2. + */ + static const T coeff[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + return std::make_tuple(coeff, 17); +}; + +template +C10_HOST_DEVICE inline + typename std::enable_if_t, std::tuple> + chebyshev_coefficients_i1e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi). + */ + static const T coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + return std::make_tuple(coefficients, 25); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if_t, std::tuple> + chebyshev_coefficients_i1e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi). + */ + static const T coeff[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + return std::make_tuple(coeff, 7); +}; + +template +static inline C10_HOST_DEVICE scalar_t calc_i1(scalar_t _x) { + const auto x = ::abs(_x); + if (x <= scalar_t{8.0}) { + auto [A, len] = chebyshev_coefficients_i1e_A(); + scalar_t y = x / scalar_t{2.0} - scalar_t{2.0}; + const scalar_t out = ::exp(x) * x * chbevl(y, A, len); + return (_x < scalar_t{0.0}) ? -out : out; + } + + auto [B, len] = chebyshev_coefficients_i1e_B(); + const scalar_t out = (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len)) / ::sqrt(x); + return (_x < scalar_t{0.0}) ? -out : out; +} + +template +static inline C10_HOST_DEVICE scalar_t calc_i1e(scalar_t _x) { + const auto x = ::abs(_x); + if (x <= scalar_t{8.0}) { + auto [A, len] = chebyshev_coefficients_i1e_A(); + const scalar_t y = x / scalar_t{2.0} - scalar_t{2.0}; + const scalar_t out = chbevl(y, A, len) * x; + return (_x < scalar_t{0.0}) ? -out : out; + } + + auto [B, len] = chebyshev_coefficients_i1e_B(); + const scalar_t out = chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x); + return (_x < scalar_t{0.0}) ? -out : out; +} + +#endif // AT_USE_JITERATOR() (this closes the "else" branch of a if/else preprocessor directive) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh new file mode 100644 index 0000000000000000000000000000000000000000..46b93efea99c145881a29ce2e70df76b06eaf8ef --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh @@ -0,0 +1,687 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// References: +// https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access/ + +namespace at::native::memory { + +namespace detail { + +// What does the `static_unroll` do? +// +// We want to do something like: +// +// using args_t = typename traits::ArgsTuple; +// args_t args; +// #pragma unroll +// for (int i = 0; i < traits::arity; i++) { +// std::get(args) = .... +// } +// +// but unfortunately the above code does not work because +// the template argument has to be a compile time constant +// so `static_unroll` is created to simulate `#pragma unroll` +// using template metaprogramming. + +template typename func, int end, int current=0> +struct static_unroll { + template + static inline C10_HOST_DEVICE void with_args(Args&&... args) { + func::apply(std::forward(args)...); + static_unroll::with_args(args...); + } +}; + +template typename func, int end> +struct static_unroll { + template + static inline C10_HOST_DEVICE void with_args(Args... /*args*/) {} +}; + +// helper structs to be used with static_unroll to load arguments +// one by one + +template +struct vectorized_load_helper { + template + static __device__ void apply(policy_t &self, args_t *args, int idx, int block_work_size) { + using arg_t = std::tuple_element_t; + // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + auto ptr = reinterpret_cast(self.data[arg_index + 1]) + block_work_size * idx; + auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get(args[thread_unroll_idx]); }; + self.load_single_arg(args_accessor, ptr); + } +}; + +#ifdef USE_ROCM +// Templated version of vectorized load helper. +// It can be used on heterogeneous input tensor element types. +template +struct vectorized_templated_load_helper { + template + static __device__ void apply(policy_t& self, args_t* args, int idx) { + using arg_t = std::tuple_element_t; + // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + + // Delay pointer arithmetic to the policy loader where we know the actual + // type of the current argument. + char* ptr = (self.data[arg_index + 1]); + auto args_accessor = [&args] __device__(int thread_unroll_idx) -> arg_t& { + return std::get(args[thread_unroll_idx]); + }; + self.template load_single_arg(args_accessor, ptr, idx); + } +}; +#endif + +template +struct unroll_load_helper { + template + static __device__ void apply(policy_t &self, args_t *args, offset_t offset, loader_t loader, int j, int num_outputs) { + using arg_t = std::tuple_element_t; + // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + std::get(args[j]) = loader.template load(self.data[arg_index + num_outputs], offset[arg_index], arg_index); + } +}; + +template +struct multi_outputs_store_helper { + template + C10_HOST_DEVICE static void apply( + const data_t& data, + const offsets_t& offsets, + thrust::tuple ret) { + using T = typename thrust::tuple_element>::type; + T *to = reinterpret_cast(data[current]) + offsets[current]; + *to = thrust::get(ret); + } +}; + +} // namespace detail + +struct LoadWithoutCast { + template + __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) { + return c10::load(reinterpret_cast(base_ptr) + offset); + } +}; + +template +struct LoadWithCast { + using array_t = std::array(N, 1)>; + using size_array_t = std::array(N, 1)>; + + array_t dtypes; + size_array_t element_sizes; + + LoadWithCast(const TensorIteratorBase& iter) { + CUDA_KERNEL_ASSERT(iter.ninputs() == N); + #pragma unroll + for (auto i = 0; i < N; ++i) { + this->dtypes[i] = iter.dtype(i + iter.noutputs()); + element_sizes[i] = c10::elementSize(iter.dtype(i + iter.noutputs())); + } + } + + template + __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) { + void *ptr = base_ptr + element_sizes[arg] * offset; + return c10::fetch_and_cast(dtypes[arg], ptr); + } +}; + +struct StoreWithoutCast { + template + __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) { + *(reinterpret_cast(base_ptr) + offset) = value; + } +}; + +template +struct StoreWithCast { + using array_t = std::array(N, 1)>; + using size_array_t = std::array(N, 1)>; + + array_t dtypes; + size_array_t element_sizes; + + StoreWithCast(const TensorIteratorBase& iter) { + CUDA_KERNEL_ASSERT(iter.noutputs() == N); + #pragma unroll + for (auto i = 0; i < N; ++i) { + this->dtypes[i] = iter.dtype(i); + element_sizes[i] = c10::elementSize(iter.dtype(i)); + } + } + + template + __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) { + void *ptr = base_ptr + element_sizes[arg] * offset; + c10::cast_and_store(dtypes[arg], ptr, value); + } +}; + +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; +}; + +template +__device__ aligned_vector load_vector(const scalar_t *base_ptr, uint32_t offset) { + using vec_t = aligned_vector; + auto *from = reinterpret_cast(base_ptr); +#if defined(USE_ROCM) && defined(__gfx942__) + using longx2 = __attribute__((__vector_size__(4*sizeof(int)))) int; + if constexpr (sizeof(vec_t) == sizeof(int)) { + union { + vec_t v; + int i; + } tmpt = { .i = __builtin_nontemporal_load(reinterpret_cast(&(from[offset]))) }; + return tmpt.v; + } + else if constexpr (sizeof(vec_t) == sizeof(long)) { + union { + vec_t v; + long i; + } tmpt = { .i = __builtin_nontemporal_load(reinterpret_cast(&(from[offset]))) }; + return tmpt.v; + } + else if constexpr (sizeof(vec_t) == sizeof(longx2)) { + union { + vec_t v; + longx2 i; + } tmpt = { .i = __builtin_nontemporal_load(reinterpret_cast(&(from[offset]))) }; + return tmpt.v; + } +#endif + return from[offset]; +} + +template +__device__ aligned_vector load_vector(const bool *base_ptr, uint32_t offset) { + // See NOTE [Loading boolean values] + auto tmp = load_vector(reinterpret_cast(base_ptr), offset); + aligned_vector ret; + for (int i = 0; i < vec_size; ++i) { + ret.val[i] = bool(tmp.val[i]); + } + return ret; +} + +namespace policies { + +template < + int num_threads, + typename data_t, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + typename storer_t, + int elems_per_thread, + int num_outputs = 1> +struct unroll_base { + data_t data; + int remaining; + inp_calc_t input_offset_calculator; + out_calc_t output_offset_calculator; + loader_t loader; + storer_t storer; + static constexpr int tws = elems_per_thread; + static constexpr int block_work_size = elems_per_thread * num_threads; + + __device__ unroll_base( + data_t data, + int remaining, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s) + : data(data), + remaining(remaining), + input_offset_calculator(ic), + output_offset_calculator(oc), + loader(l), + storer(s) {} + + __device__ inline bool check_inbounds(int thread_work_elem) { + return ((int)(threadIdx.x + thread_work_elem * num_threads) < remaining); + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size_v; + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < elems_per_thread; i++) { + if (thread_idx < remaining) { + int linear_idx = thread_idx + block_work_size * idx; + auto offset = input_offset_calculator.get(linear_idx); + detail::static_unroll::with_args( + *this, args, offset, loader, i, num_outputs); + thread_idx += num_threads; + } + } + } + + template + __device__ inline void store(scalar_t *from, int idx) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < elems_per_thread; i++) { + if (thread_idx < remaining) { + int linear_idx = thread_idx + block_work_size * idx; + int offset = output_offset_calculator.get(linear_idx)[0]; + storer.store(from[i], data[0], offset); + thread_idx += num_threads; + } + } + } +}; + +// Utility type for all users of unroll that extract the num_threads value from +// the caller scope. +template < + typename data_t, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + typename storer_t, + int elems_per_thread, + int num_outputs = 1> +using unroll = unroll_base< + num_threads(), + data_t, + inp_calc_t, + out_calc_t, + loader_t, + storer_t, + elems_per_thread, + num_outputs>; + +template // vec_size: number of scalars, can be 1, 2, or 4. +struct vectorized { + + static_assert(elems_per_thread % vec_size == 0, "The workload per thread must be a multiple of vec_size"); + static constexpr int loop_size = elems_per_thread / vec_size; + static constexpr int tws = elems_per_thread; + + data_t data; + + __device__ vectorized(data_t data) : data(data) {} + + __device__ inline constexpr bool check_inbounds(int thread_work_elem) { + return true; + } + + template + __device__ inline void load_single_arg(accessor_t to, scalar_t *from) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads(); + auto v = load_vector(from, index); + #pragma unroll + for (int j = 0; j < vec_size; j++) { + to(vec_size * i + j) = v.val[j]; + } + } + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size_v; + detail::static_unroll::with_args(*this, args, idx, elems_per_thread * num_threads()); + } + + template + __device__ inline void store(scalar_t *from, int idx) { + using vec_t = aligned_vector; + scalar_t *to = reinterpret_cast(data[0]) + elems_per_thread * num_threads() * idx; + vec_t *to_ = reinterpret_cast(to); + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads(); + vec_t v; + for (int j = 0; j < vec_size; j++) { + v.val[j] = from[vec_size * i + j]; + } + to_[index] = v; + } + } +}; + +#ifdef USE_ROCM +// This is similar to vectorized policy above, but this one supports +// heterogeneous input tensor types as templated parameters. +// Its use should be limited to frequently used heterogeneous data types +// as each instantiation will generate a separate kernel, leading to code +// bloating if applied to all combinations supported in PyTorch. Assumption: all +// tensors are contiguous, that is: stride == sizeof(type) for all tensors. +template < + int vec_size, + typename data_t, + int elems_per_thread, + int num_threads, + typename CastToT, + typename... CastFromTs> // vec_size: number of scalars, can be 1, 2, or 4. +struct vectorized_templated { + static_assert( + elems_per_thread % vec_size == 0, + "The workload per thread must be a multiple of vec_size"); + static constexpr int loop_size = elems_per_thread / vec_size; + static constexpr int tws = elems_per_thread; + static constexpr int block_work_size = elems_per_thread * num_threads; + data_t data; + + __device__ vectorized_templated(data_t data) : data(data) {} + + __device__ inline constexpr bool check_inbounds(int thread_work_elem) { + return true; + } + + template + __device__ inline void load_single_arg(accessor_t to, char* ptr, int idx) { + // extract the arg_index-th input tensor element type from the + // variadic template argument. + using CastFromT = + std::tuple_element_t>; + // Delayed pointer arithmetic from the caller: this is the place + // where we know the type of the argument. + CastFromT* block_ptr = + reinterpret_cast(ptr) + block_work_size * idx; + int thread_idx = threadIdx.x; +#pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads; + auto v = load_vector(block_ptr, index); +#pragma unroll + for (int j = 0; j < vec_size; j++) { + to(vec_size * i + j) = c10::convert(v.val[j]); + } + } + } + + template + __device__ inline void load(args_t* args, int idx) { + constexpr int arity = std::tuple_size::value; + detail::static_unroll:: + with_args(*this, args, idx); + } + + // Assume for now that from (temporary array per thread) is of the same + // type as to (destination tensor), which is the case for + // float(float,bfloat16) and functor add on float(float,float). + template + __device__ inline void store(scalar_t* from, int idx) { + using vec_t = aligned_vector; + CastToT* to = reinterpret_cast(data[0]) + block_work_size * idx; + vec_t* to_ = reinterpret_cast(to); + int thread_idx = threadIdx.x; +#pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads; + vec_t v; + for (int j = 0; j < vec_size; j++) { + v.val[j] = from[vec_size * i + j]; + } + to_[index] = v; + } + } +}; +#endif + +template +struct multi_outputs_unroll { + //multi_outputs_unroll struct members and check_inbounds and load methods are copypasted from unroll struct + //we don't use inheritance because of compiler bug in cuda 10.2+ + data_t data; + int remaining; + inp_calc_t input_offset_calculator; + out_calc_t output_offset_calculator; + LoadWithoutCast loader; + StoreWithoutCast storer; + static constexpr int tws = thread_work_size(); + + __device__ multi_outputs_unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc): + data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {} + + __device__ inline bool check_inbounds(int thread_work_elem) { + return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining); + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size_v; + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + auto offset = input_offset_calculator.get(linear_idx); + detail::static_unroll::with_args(*this, args, offset, loader, i, num_outputs); + thread_idx += num_threads(); + } + } + + + template + __device__ inline void store(return_t *from, int idx) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= this->remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + auto offsets = this->output_offset_calculator.get(linear_idx); + memory::detail::static_unroll::with_args(this->data, offsets, from[i]); + thread_idx += num_threads(); + } + } +}; + +} // namespace policies + +// This is only used in host, but we will wrap this into some templates +// which is C10_HOST_DEVICE, so we have to make this C10_HOST_DEVICE +// in order to compile +template +inline C10_HOST_DEVICE int can_vectorize_up_to(const char *pointer) { + uint64_t address = reinterpret_cast(pointer); + constexpr int vec2_alignment = std::alignment_of_v>; + constexpr int vec4_alignment = std::alignment_of_v>; + constexpr int vec8_alignment = std::alignment_of_v>; +#ifdef USE_ROCM + constexpr int vec16_alignment = std::alignment_of_v>; + constexpr int type_size = sizeof(scalar_t); + if (type_size == 1 && (address % vec16_alignment == 0)) { + return 16; + } else if (type_size <= 2 && (address % vec8_alignment == 0)) { + return 8; + } else +#else + if (address % vec8_alignment == 0) { + return 8; + } else +#endif + if (address % vec4_alignment == 0) { + return 4; + } else if (address % vec2_alignment == 0) { + return 2; + } + return 1; +} + +template +inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) { + return can_vectorize_up_to(static_cast(pointer)); +} + +template +struct can_vectorize_up_to_helper { + template + static C10_HOST_DEVICE void apply(int &result, array_t pointers, traits /*_*/) { + using arg_t = typename traits::template arg::type; + // `pointers` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + result = std::min(result, can_vectorize_up_to(pointers[i + 1])); + } +}; + +template +inline int can_vectorize_up_to(array_t pointers) { + using traits = function_traits; + using return_t = typename traits::result_type; + constexpr int arity = traits::arity; + int result = can_vectorize_up_to(pointers[0]); + // We need to get the type for each argument of `func_t`, this can only + // be done at compile time. + detail::static_unroll::with_args(result, pointers, traits()); + return result; +} + + + +template +__inline__ size_t get_alignment(T ptr_or_size) { + auto val = reinterpret_cast(ptr_or_size); + if (val % 16 == 0) { + return 16; + } else if (val % 8 == 0) { + return 8; + } else if (val % 4 == 0) { + return 4; + } else if (val % 2 == 0) { + return 2; + } else { + return 1; + } +} + +template <> +__inline__ size_t get_alignment(size_t size) { + return get_alignment(reinterpret_cast(size)); +} + +template +inline constexpr bool dependent_bool_value = Value; + +template +inline constexpr bool dependent_false = dependent_bool_value; + +template +union Vec; + +template <> +union Vec<4> { + uint16_t u16[2]; + uint32_t u32, as_scalar; + float f32; +}; + +template <> +union Vec<8> { + uint16_t u16[4]; + uint32_t u32[2]; + uint64_t u64, as_scalar; + float f32[2]; +}; + +template <> +union alignas(16) Vec<16> { + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + uint4 u128, as_scalar; + float f32[4]; +}; + +template +__device__ __inline__ Vec ld_vec(const T* addr) { + Vec vec; + if constexpr (Alignment == 16) { +#if defined(USE_ROCM) + vec.u128 = *reinterpret_cast(addr); + } else if constexpr (Alignment == 8) { + vec.u64 = *reinterpret_cast(addr); + } else if constexpr (Alignment == 4) { + vec.u32 = *reinterpret_cast(addr); +#else + asm("ld.global.v4.u32 {%0,%1,%2,%3}, [%4];" + : "=r"(vec.u32[0]), "=r"(vec.u32[1]), "=r"(vec.u32[2]), "=r"(vec.u32[3]) + : "l"(addr) + : "memory"); + } else if constexpr (Alignment == 8) { + asm("ld.global.v2.u32 {%0,%1}, [%2];" + : "=r"(vec.u32[0]), "=r"(vec.u32[1]) + : "l"(addr) + : "memory"); + } else if constexpr (Alignment == 4) { + asm("ld.global.u32 %0, [%1];" : "=r"(vec.u32) : "l"(addr) : "memory"); +#endif + } else { + static_assert(dependent_false); + } + return vec; +} + +template +__device__ __inline__ void st_vec(T* addr, const Vec& vec) { + if constexpr (Alignment == 16) { +#if defined(USE_ROCM) + reinterpret_cast(addr)[0] = vec.u64[0]; + reinterpret_cast(addr)[1] = vec.u64[1]; + } else if constexpr (Alignment == 8) { + *reinterpret_cast(addr) = vec.u64; + } else if constexpr (Alignment == 4) { + *reinterpret_cast(addr) = vec.u32; +#else + asm("st.global.v4.u32 [%0], {%1,%2,%3,%4};" + : + : "l"(addr), + "r"(vec.u32[0]), + "r"(vec.u32[1]), + "r"(vec.u32[2]), + "r"(vec.u32[3]) + : "memory"); + } else if constexpr (Alignment == 8) { + asm("st.global.v2.u32 [%0], {%1,%2};" + : + : "l"(addr), "r"(vec.u32[0]), "r"(vec.u32[1]) + : "memory"); + } else if constexpr (Alignment == 4) { + asm("st.global.u32 [%0], %1;" : : "l"(addr), "r"(vec.u32) : "memory"); +#endif + } else { + static_assert(dependent_false); + } +} + + + +} // namespace at::native::memory + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MiscUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MiscUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..e76505cedc08bfdc3e5ed83625a9b5187af3895a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MiscUtils.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + + +namespace at::native { + +static inline int cuda_int_cast(int64_t value, const char* varname) { + auto result = static_cast(value); + TORCH_CHECK(static_cast(result) == value, + "cuda_int_cast: The value of ", varname, "(", (long long)value, + ") is too large to fit into a int (", sizeof(int), " bytes)"); + return result; +} + +// Creates an array of size elements of type T, backed by pinned memory +// wrapped in a Storage +template +static inline Storage pin_memory(int64_t size) { + auto* allocator = cuda::getPinnedMemoryAllocator(); + int64_t adjusted_size = size * sizeof(T); + return Storage( + Storage::use_byte_size_t(), + adjusted_size, + allocator, + /*resizable=*/false); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh new file mode 100644 index 0000000000000000000000000000000000000000..37b4b11a6d9607139b18dfe2f6ef51a9fc5b4685 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh @@ -0,0 +1,387 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include + +namespace at::native { + +namespace { + +static constexpr int64_t kILP = 4; +static constexpr int64_t kChunkSize = 65536; +static constexpr int64_t kBlockSize = 512; + +// TODO(crcrpar): Add `n>5` for `low prec params & their higher prec copy` +// TensorListMetadata has to be < 4KB - the limit for kernel launch argument +static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; +static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; +static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; +static constexpr int depth_to_max_tensors_scalarlist_of_complex_double[2] = { + 72, + 60}; + +template +__device__ __forceinline__ bool is_aligned(T* p) { + return ((uint64_t)p) % (kILP * sizeof(T)) == 0; +} + +template +__device__ __forceinline__ void load_store( + T* dst, + T* src, + int64_t dst_offset, + int64_t src_offset) { + using LT = at::native::memory::aligned_vector; + ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset]; +} + +template +struct TensorListMetadata { + const void* addresses[n][depth_to_max_tensors[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; + int start_tensor_this_launch; +}; + +template +struct TensorListScalarListMetadata { + const void* addresses[n][depth_to_max_tensors_scalarlist[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors_scalarlist[n - 1]]; + scalar_vals_t scalar_vals[depth_to_max_tensors_scalarlist[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; +}; + +// note(mkozuki): `n` of 1&2 violate the limit of cuda kernel argument size of +// 4kb with `c10::complex` +template <> +struct TensorListScalarListMetadata, 1> { + const void* addresses[1] + [depth_to_max_tensors_scalarlist_of_complex_double[0]]; + int64_t + numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[0]]; + c10::complex + scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[0]]; + unsigned char block_to_tensor[depth_to_max_blocks[1 - 1]]; + int block_to_chunk[depth_to_max_blocks[1 - 1]]; +}; + +template <> +struct TensorListScalarListMetadata, 2> { + const void* addresses[2] + [depth_to_max_tensors_scalarlist_of_complex_double[1]]; + int64_t + numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[1]]; + c10::complex + scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[1]]; + unsigned char block_to_tensor[depth_to_max_blocks[2 - 1]]; + int block_to_chunk[depth_to_max_blocks[2 - 1]]; +}; + +// NOTE(crcrpar): This is a conservative resolution to handle `state_steps` +// whose each element is `at::Tensor` of 1 element representing the number of +// `step`s called so far. +template +struct FusedOptimizerTensorListMetadata { + const void* addresses[n][depth_to_max_tensors[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors[n - 1]]; + const void* state_steps_addresses[depth_to_max_tensors_scalarlist[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; + int start_tensor_this_launch; +}; + +template +C10_LAUNCH_BOUNDS_1(kBlockSize) +__global__ void multi_tensor_apply_kernel( + T tensorListMeta, + U callable, + ArgTypes... args) { + // Hand the chunk information to the user-supplied functor to process however + // it likes. + callable(kChunkSize, tensorListMeta, args...); +} + +} // namespace + +// multi_tensor_apply enables horizontal fusion across lists of tensors. +// For example, whereas you once had a for-loop of a + b = c, where a, b, +// and c are individual tensors in lists as, bs, and cs, you can now with +// fewer kernel launches compute as + bs = cs. +// +// You can also imagine bs to be a scalar list vs a tensor list. +// +// The function below takes in tensor lists, scalars, and a callable and +// chunks up the computation to launch as few kernels as possible by iterating +// through every "chunk" in every tensor (thus the nested for loops). In the +// simplest case, everything gets bundled into just one kernel launch, but +// due to blocksize constraints, we may need to launch multiple kernels. +// Each kernel launch is defined by one tensorListMeta construct, which we +// use to track and reset the necessary metadata for each launch. +template +void multi_tensor_apply( + std::vector>& tensor_lists, + at::ArrayRef scalars, + T callable, + ArgTypes... args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth."); + const size_t n_tensors = tensor_lists[0].size(); + using scalar_vals_t = typename T::opmath_t; + TensorListScalarListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for (size_t t = 0; t < n_tensors; t++) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][t].numel() == 0) { + continue; + } + tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t].to(); + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][t].const_data_ptr(); + } + loc_tensor_info++; + + // now we enter [chunking territory]. + // we will launch a kernel when EITHER the blocks get filled up OR + // the tensors get filled up. There will always be at least one block + // per tensor since the zero-sized ones will not enter the loop, so + // the nested forloop within represents iterating through the chunks + // of a single tensor. + const auto numel = tensor_lists[0][t].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + for (auto chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + // a tensor is not considered full unless all its chunks have been + // processed + const bool tensors_full = + (loc_tensor_info == depth_to_max_tensors_scalarlist[depth - 1] && + chunk == chunks - 1); + const bool blocks_full = + (loc_block_info == depth_to_max_blocks[depth - 1]); + + if (tensors_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>( + tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + // Reset. + loc_block_info = 0; + // all chunks have already been handled in the kernel + if (chunk == chunks - 1) { + loc_tensor_info = 0; + } else { // blocks were full and tensor chunks remain + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + tensorListMeta.scalar_vals[0] = + tensorListMeta.scalar_vals[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + } + } + } + } + + // note: [finishing what we started] + // if there's remaining work to be done but the tensors/blocks aren't full + // yet we are at the end, submit the kernel to do the work! + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +template +void multi_tensor_apply( + std::vector>& tensor_lists, + T callable, + ArgTypes... args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth."); + const size_t n_tensors = tensor_lists[0].size(); + TensorListMetadata tensorListMeta; + tensorListMeta.start_tensor_this_launch = 0; + + int loc_block_info = 0; + int loc_tensor_info = 0; + int processed = 0; + + for (size_t t = 0; t < n_tensors; t++) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][t].numel() == 0) { + continue; + } + processed++; + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][t].const_data_ptr(); + } + loc_tensor_info++; + + // see note: [chunking territory]. + const auto numel = tensor_lists[0][t].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + for (auto chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + const bool tensors_full = + (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks - 1); + const bool blocks_full = + (loc_block_info == depth_to_max_blocks[depth - 1]); + + if (tensors_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>( + tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + // Reset. + loc_block_info = 0; + if (chunk == chunks - 1) { + loc_tensor_info = 0; + tensorListMeta.start_tensor_this_launch = processed; + } else { + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + tensorListMeta.start_tensor_this_launch = processed - 1; + } + } + } + } + + // see note: [finishing what we started] + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +template +void multi_tensor_apply_for_fused_optimizer( + std::vector>& tensor_lists, + at::TensorList state_steps, + T callable, + ArgTypes... args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth"); + const auto num_tensors = tensor_lists[0].size(); + FusedOptimizerTensorListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for (const auto& tensor_index : c10::irange(num_tensors)) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][tensor_index].numel() == 0) { + continue; + } + tensorListMeta.state_steps_addresses[loc_tensor_info] = + state_steps[tensor_index].const_data_ptr(); + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][tensor_index].numel(); + for (const auto& d : c10::irange(depth)) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][tensor_index].const_data_ptr(); + } + loc_tensor_info++; + + // see above note: [chunking territory] + const auto numel = tensor_lists[0][tensor_index].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + TORCH_CHECK(chunks > -1); + for (const auto& chunk : c10::irange(chunks)) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + const auto tensor_full = + (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks - 1); + const auto blocks_full = loc_block_info == depth_to_max_blocks[depth - 1]; + + if (tensor_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>( + tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + // Reset. + loc_block_info = 0; + if (chunk == chunks - 1) { + loc_tensor_info = 0; + } else { + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + tensorListMeta.state_steps_addresses[0] = + tensorListMeta.state_steps_addresses[loc_tensor_info - 1]; + for (const auto& d : c10::irange(depth)) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + } + } + } + } + + // see above note: [finishing what we've started] + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Normalization.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Normalization.cuh new file mode 100644 index 0000000000000000000000000000000000000000..bfd2d30570b6d2f4b96f88aa4ca70a4cd9f4bae5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Normalization.cuh @@ -0,0 +1,1778 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#endif + +namespace at::native { + +// The maximum number of threads in a block +#if defined(USE_ROCM) +constexpr int MAX_BLOCK_SIZE = 1024; +#else +constexpr int MAX_BLOCK_SIZE = 512; +#endif + +constexpr unsigned MAX_GRID_SIZE = 65535u; + +// Number of threads in a block given an input size up to MAX_BLOCK_SIZE +static int getNumThreads(int nElem) { +#if defined(USE_ROCM) + int threadSizes[5] = { 64, 128, 256, 512, MAX_BLOCK_SIZE }; +#else + int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE }; +#endif + for (int i = 0; i != 5; ++i) { + if (nElem <= threadSizes[i]) { + return threadSizes[i]; + } + } + return MAX_BLOCK_SIZE; +} + +// Returns the index of the most significant 1 bit in `val`. +__device__ __forceinline__ int getMSB(int val) { + return 31 - __clz(val); +} + +template +struct Float2 { + accscalar_t v1, v2; + __device__ Float2() {} + __device__ Float2(scalar_t v1, scalar_t v2) : v1(static_cast(v1)), v2(static_cast(v2)) {} + __device__ Float2(int v) : v1(static_cast(v)), v2(static_cast(v)) {} + __device__ Float2& operator+=(const Float2& a) { + v1 += a.v1; + v2 += a.v2; + return *this; + } + __device__ friend Float2 operator+(Float2 a, const Float2& b) { + a += b; + return a; + } +}; + +template +struct GradOp { + __device__ GradOp(accscalar_t m, const PTA& i, const PTA& g) + : mean(m), input(i), grad_output(g) {} + __device__ __forceinline__ Float2 operator()(int batch, int plane, int n) { + accscalar_t g = grad_output[batch][plane][n]; + accscalar_t c = static_cast(input[batch][plane][n]) - mean; + return Float2(g, g * c); + } + const accscalar_t mean; + const PTA& input; + const PTA& grad_output; +}; + +template +struct SumReduceOp { + __device__ __forceinline__ acc_t combine(acc_t a, acc_t b) const { return a + b; } + + __device__ __forceinline__ acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +}; + +template +struct SumReduceOp> { + using acc_t = Float2; + + __device__ __forceinline__ acc_t combine(acc_t a, acc_t b) const { return a + b; } + + __device__ __forceinline__ acc_t warp_shfl_down(acc_t data, int offset) const { + return {WARP_SHFL_DOWN(data.v1, offset), WARP_SHFL_DOWN(data.v2, offset)}; + } +}; + +// Sum across (batch, x/y/z) applying Op() pointwise +// this works by first having each thread sum it's part +// of the data. Then there is a double-shuffling reduction. +// First each warp (of C10_WARP_SIZE threads) uses warpSum to reduce its +// data to the "warp leader", who writes its value into shared memory. +// Then a single warp reads the remaining (at most C10_WARP_SIZE) items +// and reduces them using another warpSum. +// The implicit assumption is that there are no more +// than C10_WARP_SIZE**2 threads. +template +__device__ scalar_t reduce(Op op, PTA tensor, int plane) { + // first the reductions each thread does separately + scalar_t sum = static_cast(0); + for (int batch = threadIdx.y; batch < tensor.size(0); batch += blockDim.y) { +#if defined(USE_ROCM) + constexpr int UNRL = 4; // load deserilize factor + scalar_t tmp[UNRL]; + for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x*UNRL) { +#pragma unroll + for (int u = 0; u < UNRL; u++) + tmp[u] = op(batch, plane, std::min((int)tensor.size(2)-1, (int)(x+u*blockDim.x))); +#pragma unroll + for (int u = 0; u < UNRL; u++) + if (x+u*blockDim.x < tensor.size(2)) + sum += tmp[u]; + } +#else + for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x) { + sum += op(batch, plane, x); + } +#endif + } + __shared__ scalar_t shared[C10_WARP_SIZE]; + SumReduceOp reduce_op; + sum = cuda_utils::BlockReduce, cuda_utils::Block2D>(sum, reduce_op, 0, shared); + if (threadIdx.x == 0 && threadIdx.y == 0) { + shared[0] = sum; + } + __syncthreads(); + // Everyone picks it up, should be broadcast into the whole grad_input + return shared[0]; +} + +constexpr int ELEMENTS_PER_ITER = 4; // enables concurrency within each thread to hide latency +constexpr int ELEMENTS_PER_THREAD = 16; +constexpr int OPTIMAL_TILE_W = 32; +constexpr int MAX_H_BLOCK = 128; + +__host__ void flexible_launch_configs( + const int reduction, + const int stride, + dim3 &block, + dim3 &grid, + const bool coop_flag = false) { + int block_x = std::min(lastPow2(stride), OPTIMAL_TILE_W); + int block_y = std::min(lastPow2(at::ceil_div(reduction , ELEMENTS_PER_THREAD)), + MAX_BLOCK_SIZE / block_x); + if (block_x * block_y != MAX_BLOCK_SIZE) { + block_x = std::min(lastPow2(stride), MAX_BLOCK_SIZE / block_y); + } + + int grid_x = at::ceil_div(stride, block_x); + int grid_y = std::min(at::ceil_div(reduction, block_y * ELEMENTS_PER_THREAD), MAX_H_BLOCK); + if (coop_flag) { + // it's not worth having a grid reduction if the reduction dimension is not big enough + grid_y = grid_y < 8 ? 1 : grid_y; + } + + block.x = block_x; + block.y = block_y; + block.z = 1; + grid.x = grid_x; + grid.y = grid_y; + grid.z = 1; +} + +template +__device__ __forceinline__ void welford_merge_element(C& count, + T& mean, + T& m2n, + const C& count_new, + const T& mean_new, + const T& m2n_new) { + T factor = T(1.0) / ::max(1, (count + count_new)); + T delta0 = mean - mean_new; + mean = (mean_new * count_new + mean * count) * factor; + m2n += m2n_new + delta0 * delta0 * count_new * count * factor; + count += count_new; +} + +// merge mean/m2n among threadIdx.y within block +template +__device__ __forceinline__ void welford_merge_block_vertical(C& count, + T& mean, + T& m2n, + C* shmem_count, + T* shmem_mean, + T* shmem_m2n) { + // write to shared memory + auto address_base = threadIdx.x + threadIdx.y * blockDim.x; + +#pragma unroll + for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset*2) { + shmem_mean[address_base] = mean; + shmem_m2n[address_base] = m2n; + shmem_count[address_base] = count; + } + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + auto address = address_base + offset * blockDim.x; + // read shared memory back to register for reduction + auto count_new = shmem_count[address]; + auto mean_new = shmem_mean[address]; + auto m2n_new = shmem_m2n[address]; + + welford_merge_element(count, mean, m2n, count_new, mean_new, m2n_new); + } + } +} + +template +__global__ void batch_norm_transform_input_kernel( + const GenericPackedTensorAccessor input, + GenericPackedTensorAccessor output, + const GenericPackedTensorAccessor, 1, RestrictPtrTraits, index_t> mean_, + const GenericPackedTensorAccessor, 1, RestrictPtrTraits, index_t> var_or_invstd, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor bias, + stat_accscalar_t epsilon) { + + index_t plane = blockIdx.x; + + if (plane >= input.size(1)) { + return; + } + + stat_accscalar_t gamma = weight.size(0) > 0 ? static_cast(weight[plane]) : static_cast(1); + stat_accscalar_t beta = bias.size(0) > 0 ? static_cast(bias[plane]) : static_cast(0); + stat_accscalar_t mean = static_cast(mean_[plane]); + stat_accscalar_t invstd; + if (train) { + invstd = var_or_invstd[plane]; + } else { + invstd = static_cast(1) / device_sqrt(static_cast(var_or_invstd[plane]) + epsilon); + } + + index_t bs = input.size(0); + index_t fs = input.size(2); + + index_t bstep = blockDim.y * gridDim.y; + for (index_t batch = threadIdx.y + blockIdx.y * blockDim.y; batch < bs; batch += bstep) { + auto o = output[batch][plane]; + auto i = input[batch][plane]; + for (index_t feature = threadIdx.x; feature < fs; feature += blockDim.x) { + o[feature] = static_cast(gamma * (i[feature] - mean) * invstd + beta); + } + } +} + +struct InvStd { + template + __device__ __forceinline__ T operator()(T var, double epsilon) const { + T invstd = 0; + if (var != static_cast(0) || epsilon != static_cast(0)) { + invstd = static_cast(1) / device_sqrt(var + epsilon); + } + return invstd; + } +}; + +struct Var { + template + __device__ __forceinline__ T operator()(T var, double epsilon) const { + return var; + } +}; + +template +__global__ void batch_norm_collect_statistics_kernel( + const GenericPackedTensorAccessor input, + const stat_accscalar_t epsilon, + const stat_accscalar_t momentum, + GenericPackedTensorAccessor save_mean, + GenericPackedTensorAccessor save_transformed_var) { + + __shared__ int shared_n[2 * 2 * C10_WARP_SIZE + C10_WARP_SIZE]; + + int plane = blockIdx.x; + int N = input.size(0) * input.size(2); + int tid = threadIdx.x + threadIdx.y * blockDim.x; + + // Compute the mean and variance across (batch, x/y/z) + // this uses the Welford (in the for loop)/parallel algorithm (to sum across the block) + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_Online_algorithm + // and the parallel algorithm on the same page. + // We use two shuffles to reduce across the entire block. + // https://devblogs.nvidia.com/faster-parallel-reductions-kepler/ has a description. + stat_accscalar_t* shared_avg_var = (stat_accscalar_t*) &shared_n[C10_WARP_SIZE]; + + // first the reductions each thread does separately + stat_accscalar_t avg = 0; + stat_accscalar_t var_n = 0; + int n = 0; + for (int batch = threadIdx.y; batch < input.size(0); batch += blockDim.y) { +#if defined(USE_ROCM) + constexpr int UNRL = 4; + stat_accscalar_t v_[UNRL]; + for (int x = threadIdx.x; x < input.size(2); x += blockDim.x*UNRL) { + for (int u = 0; u < UNRL; u++) + v_[u] = input[batch][plane][std::min(x+u*blockDim.x, input.size(2)-1)]; + for (int u = 0; u < UNRL; u++) { + if (x+u*blockDim.x < input.size(2)) { + stat_accscalar_t d1 = v_[u] - avg; + n++; + avg += d1 / n; + var_n += d1 * (v_[u] - avg); + } + } + } +#else + for (int x = threadIdx.x; x < input.size(2); x += blockDim.x) { + stat_accscalar_t v = input[batch][plane][x]; + stat_accscalar_t d1 = v - avg; + n++; + avg += d1 / n; + var_n += d1 * (v - avg); + } +#endif + } + + // first warpSum to get one value per thread to + // one value per warp + for (int i = 0; i < getMSB(C10_WARP_SIZE); ++i) { + stat_accscalar_t o_avg = WARP_SHFL_XOR(avg, 1 << i, C10_WARP_SIZE); + int o_n = WARP_SHFL_XOR(n, 1 << i, C10_WARP_SIZE); + stat_accscalar_t factor = 1.0 / fmaxf(1.0, n+o_n); + var_n += WARP_SHFL_XOR(var_n, 1 << i, C10_WARP_SIZE) + (avg - o_avg) * (avg - o_avg) * n * o_n * factor; + avg = (n * avg + o_n * o_avg) * factor; + n += o_n; + } + + // this writes each warps item into shared memory + // there are at most C10_WARP_SIZE items left because + // there are at most C10_WARP_SIZE**2 threads at the beginning + __syncthreads(); + if (tid % C10_WARP_SIZE == 0) { + shared_n[tid / C10_WARP_SIZE] = n; + shared_avg_var[tid / C10_WARP_SIZE * 2] = avg; + shared_avg_var[tid / C10_WARP_SIZE * 2 + 1] = var_n; + } + __syncthreads(); + // now have a second warpSum to reduce the intermediate values + // from shared memory to a single number. The very first + // thread writes it to shared memory. + + if (tid < C10_WARP_SIZE) { + n = (tid < blockDim.x * blockDim.y / C10_WARP_SIZE ? shared_n[tid] : 0); + avg = (tid < blockDim.x * blockDim.y / C10_WARP_SIZE ? shared_avg_var[2 * tid] : stat_accscalar_t(0)); + var_n = (tid < blockDim.x * blockDim.y / C10_WARP_SIZE ? shared_avg_var[2 * tid + 1] : stat_accscalar_t(0)); + } + for (int i = 0; i < getMSB(C10_WARP_SIZE); ++i) { + stat_accscalar_t o_avg = WARP_SHFL_XOR(avg, 1 << i, C10_WARP_SIZE); + int o_n = WARP_SHFL_XOR(n, 1 << i, C10_WARP_SIZE); + stat_accscalar_t factor = 1.0 / fmaxf(1.0, n+o_n); + var_n += WARP_SHFL_XOR(var_n, 1 << i, C10_WARP_SIZE) + (avg - o_avg) * (avg - o_avg) * n * o_n * factor; + avg = (n * avg + o_n * o_avg) * factor; + n += o_n; + } + + // Save the mean, variance, and moving averages + if (tid == 0) { + if (save_mean.data() != NULL) { + save_mean[plane] = avg; + } + if (save_transformed_var.data() != NULL) { + save_transformed_var[plane] = VarTransform{}(var_n / N, epsilon); + } + } + +} + +template +__global__ void batch_norm_backward_kernel( + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, + GenericPackedTensorAccessor grad_input, + GenericPackedTensorAccessor grad_weight, + GenericPackedTensorAccessor grad_bias, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor running_mean, + const GenericPackedTensorAccessor running_var, + const GenericPackedTensorAccessor save_mean, + const GenericPackedTensorAccessor save_invstd, + bool train, + stat_accscalar_t epsilon) { + + index_t plane = blockIdx.x; + index_t N = grad_output.size(0) * grad_output.size(2); + + stat_accscalar_t mean, invstd; + if (train) { + mean = save_mean[plane]; + invstd = save_invstd[plane]; + } else { + mean = static_cast(running_mean[plane]); + invstd = static_cast(1) / device_sqrt(static_cast(running_var[plane]) + epsilon); + } + + stat_accscalar_t weight_val = weight.size(0) > 0 ? static_cast(weight[plane]) : stat_accscalar_t(1); + stat_accscalar_t norm = stat_accscalar_t(1) / N; + + // Compute two values across (batch, x/y/z) in one pass: + // 1. Sum(grad_output) + // 2. DotProduct(input - mean, grad_output) + GradOp> g(mean, input, grad_output); + auto res = reduce>(g, grad_output, plane); + + stat_accscalar_t grad_output_sum = res.v1; + stat_accscalar_t dot_p = res.v2; + + stat_accscalar_t grad_mean = grad_output_sum * norm; + stat_accscalar_t proj_scale = dot_p * norm * invstd * invstd; + stat_accscalar_t grad_scale = invstd * weight_val; + + if (grad_input.data() != NULL) { + for (int batch = threadIdx.y; batch < grad_output.size(0); batch += blockDim.y) { + for (int x = threadIdx.x; x < grad_output.size(2); x += blockDim.x) { + input_scalar_t go = grad_output[batch][plane][x]; + if (train) { + stat_accscalar_t inp = input[batch][plane][x]; + stat_accscalar_t proj = (inp - mean) * proj_scale; + grad_input[batch][plane][x] = static_cast((go - proj - grad_mean) * grad_scale); + } else { + grad_input[batch][plane][x] = static_cast(go * grad_scale); + } + } + } + } + + if (grad_weight.size(0) > 0) { + if (threadIdx.x == 0) { + grad_weight[plane] = static_cast(dot_p * invstd); + } + } + + if (grad_bias.size(0) > 0) { + if (threadIdx.x == 0) { + grad_bias[plane] = static_cast(grad_output_sum); + } + } +} + +template +__global__ void batch_norm_reduce_statistics_kernel( + const GenericPackedTensorAccessor vec_mean, + const GenericPackedTensorAccessor vec_invstd, + GenericPackedTensorAccessor mean, + GenericPackedTensorAccessor invstd, + GenericPackedTensorAccessor running_mean, + GenericPackedTensorAccessor running_var, + const accscalar_t epsilon, + const accscalar_t momentum, + const GenericPackedTensorAccessor counts) { + + int feature_size = vec_mean.size(1); + int world_size = vec_mean.size(0); + + int bid = blockIdx.x; + int tid = threadIdx.x; + + // first the reductions each thread does separately + for (int i = bid*blockDim.x+tid; i < feature_size; i += gridDim.x*blockDim.x) { + accscalar_t avg = 0; + accscalar_t var_n = 0; + index_t n = 0; + for (int j = 0; j < world_size; j++) { + scalar_t count = counts[j]; + accscalar_t m = vec_mean[j][i]; + accscalar_t v = accscalar_t(1.0) / (vec_invstd[j][i]); + v = (v * v - epsilon) * count; + accscalar_t factor = 1.0 / (n + count); + var_n += v + (avg - m) * (avg - m) * n * count * factor; + avg = n * factor * avg + count * factor * m; + n += count; + } + mean[i] = avg; + invstd[i] = static_cast(1) / device_sqrt(var_n / n + epsilon); + if (running_mean.data() != NULL) { + running_mean[i] = static_cast((1 - momentum) * running_mean[i] + momentum * avg); + } + accscalar_t unbiasedVar = var_n / (n - 1); + if (running_var.data() != NULL) { + running_var[i] = static_cast((1 - momentum) * running_var[i] + momentum * unbiasedVar); + } + } + +} + +template +__global__ void batch_norm_backward_reduce_kernel( + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, + GenericPackedTensorAccessor mean, + GenericPackedTensorAccessor invstd, + GenericPackedTensorAccessor sum_dy, + GenericPackedTensorAccessor sum_dy_xmu, + GenericPackedTensorAccessor grad_weight, + GenericPackedTensorAccessor grad_bias) { + + index_t plane = blockIdx.x; + + stat_accscalar_t r_mean = mean[plane]; + stat_accscalar_t factor = invstd[plane]; + + GradOp> g(r_mean, input, grad_output); + auto res = reduce>(g, grad_output, plane); + + if (threadIdx.x == 0) { + if (grad_weight.size(0) > 0) { + grad_weight[plane] = static_cast(res.v2 * factor); + } + if (grad_bias.size(0) > 0) { + grad_bias[plane] = static_cast(res.v1); + } + if (sum_dy.size(0) > 0) { + sum_dy[plane] = static_cast(res.v1); + } + if (sum_dy_xmu.size(0) > 0) { + sum_dy_xmu[plane] = static_cast(res.v2); + } + } +} + +template +__device__ __forceinline__ void batch_norm_backward_elemt_kernel_impl( + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, + const GenericPackedTensorAccessor mean, + const GenericPackedTensorAccessor invstd, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor sum_dy, + const GenericPackedTensorAccessor sum_dy_xmu, + GenericPackedTensorAccessor grad_input, + const stat_accscalar_t norm_fct) { + index_t plane = blockIdx.x; + + if (plane >= input.size(1)) { + return; + } + + stat_accscalar_t m_c = mean[plane]; + stat_accscalar_t m_dy_c = sum_dy[plane] * norm_fct; + stat_accscalar_t factor_1_c = invstd[plane]; + stat_accscalar_t factor_2_c = weight.size(0) > 0 ? static_cast(weight[plane]) : stat_accscalar_t(1); + factor_2_c *= factor_1_c; + factor_1_c = factor_1_c * factor_1_c * sum_dy_xmu[plane] * norm_fct; + + index_t bs = input.size(0); + index_t fs = input.size(2); + + index_t bstep = blockDim.y * gridDim.y; + for (index_t batch = threadIdx.y + blockIdx.y * blockDim.y; batch < bs; batch += bstep) { + auto g_i = grad_input[batch][plane]; + auto g_o = grad_output[batch][plane]; + auto i = input[batch][plane]; + for (index_t feature = threadIdx.x; feature < fs; feature += blockDim.x) { + g_i[feature] = static_cast((g_o[feature] - m_dy_c - (i[feature] - m_c) * factor_1_c) * factor_2_c); + } + } +} + +template +__global__ void batch_norm_backward_elemt_kernel( + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, + const GenericPackedTensorAccessor mean, + const GenericPackedTensorAccessor invstd, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor sum_dy, + const GenericPackedTensorAccessor sum_dy_xmu, + GenericPackedTensorAccessor grad_input, + const int* __restrict__ numel, const int world_size) { + int64_t total_numel = 0; + for (int i = 0; i < world_size; i ++) { + total_numel += numel[i]; + } + + const stat_accscalar_t norm_fct = + static_cast(1) / static_cast(total_numel); + batch_norm_backward_elemt_kernel_impl( + input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, norm_fct); +} + +template +__global__ void batch_norm_backward_elemt_kernel( + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, + const GenericPackedTensorAccessor mean, + const GenericPackedTensorAccessor invstd, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor sum_dy, + const GenericPackedTensorAccessor sum_dy_xmu, + GenericPackedTensorAccessor grad_input, + const stat_accscalar_t norm_fct) { + batch_norm_backward_elemt_kernel_impl( + input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, norm_fct); +} + +template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> +static GenericPackedTensorAccessor get_packed_accessor( + const Tensor& t, std::string_view var_name) { + constexpr auto expect_type = c10::CppTypeToScalarType>::value; + const auto actual_type = t.scalar_type(); + TORCH_CHECK(actual_type == expect_type, "Expected ", var_name, + " to have type ", expect_type, " but got ", actual_type); + return t.generic_packed_accessor(); +} + +template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> +static GenericPackedTensorAccessor packed_accessor_or_dummy( + const Tensor& t, std::string_view var_name) { + if (!t.defined()) { + const std::array zeros{{0}}; + return GenericPackedTensorAccessor(nullptr, zeros.data(), zeros.data()); + } + return get_packed_accessor(t, var_name); +} + +template +std::tuple batch_norm_backward_cuda_template(const Tensor& grad_out_, const Tensor& input_, const Tensor& weight_, + const Tensor& running_mean_, const Tensor& running_var_, const Tensor& save_mean_, const Tensor& save_invstd_, + bool train, double epsilon, std::array grad_input_mask) { + + using accscalar_t = at::acc_type; + Tensor grad_input_; + Tensor grad_input_reshaped; + Tensor grad_weight_; + Tensor grad_bias_; + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); + auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes()); + + if (grad_input_mask[0]) { + grad_input_ = at::empty_like(input_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + grad_input_reshaped = grad_input_.view(input_reshaped.sizes()); + } + if (grad_input_mask[1]) { + grad_weight_ = at::empty_like(weight_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (grad_input_mask[2]) { + grad_bias_ = at::empty_like(weight_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + + auto input = get_packed_accessor< + const input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); + auto grad_output = get_packed_accessor< + const input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); + auto grad_input = packed_accessor_or_dummy< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input"); + auto weight = packed_accessor_or_dummy< + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight"); + auto grad_weight = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_weight_, "grad_weight"); + auto grad_bias = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_, "grad_bias"); + auto running_mean = packed_accessor_or_dummy< + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_mean_, "running_mean"); + auto running_var = packed_accessor_or_dummy< + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_var_, "running_var"); + auto save_mean = packed_accessor_or_dummy< + const accscalar_t, 1, DefaultPtrTraits, index_t>(save_mean_, "save_mean"); + auto save_invstd = packed_accessor_or_dummy< + const accscalar_t, 1, DefaultPtrTraits, index_t>(save_invstd_, "save_invstd"); + + auto stream = at::cuda::getCurrentCUDAStream(); + dim3 blocks(input.size(1)); + int tf = getNumThreads(input.size(2)); + dim3 threads(tf, std::max(1, MAX_BLOCK_SIZE/tf)); + + batch_norm_backward_kernel <<>> + (input, grad_output, grad_input, grad_weight, grad_bias, weight, running_mean, running_var, + save_mean, save_invstd, train, epsilon); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return std::make_tuple(grad_input_, grad_weight_, grad_bias_); +} + +template +void batch_norm_stats_cuda_template( + const Tensor& out_mean, const Tensor& out_invstd, const Tensor& input_, double epsilon) { + + using accscalar_t = at::acc_type; + int64_t n_input = input_.size(1); + Tensor dummy_mean_; + Tensor dummy_var_; + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions + + resize_output(out_mean, {n_input}); + resize_output(out_invstd, {n_input}); + auto input = get_packed_accessor< + const scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); + TORCH_INTERNAL_ASSERT(out_invstd.dim() == 1 && out_invstd.is_contiguous() && + out_invstd.sizes()[0]); + TORCH_INTERNAL_ASSERT(out_mean.dim() == 1 && out_mean.is_contiguous() && + out_mean.sizes()[0]); + + auto mean = packed_accessor_or_dummy< + accscalar_t, 1, RestrictPtrTraits, index_t>(out_mean, "out_mean"); + auto invstd = packed_accessor_or_dummy< + accscalar_t, 1, RestrictPtrTraits, index_t>(out_invstd, "out_invstd"); + auto stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks(input.size(1)); + int tf = getNumThreads(input.size(2)); + dim3 threads(tf, std::max(1, MAX_BLOCK_SIZE/tf)); + batch_norm_collect_statistics_kernel <<>> + (input, epsilon, 0.0, mean, invstd); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void batch_norm_elemt_cuda_template(const Tensor& output_, const Tensor& input_, const Tensor& weight_, + const Tensor& bias_, const Tensor& mean_, const Tensor& invstd_) { + + using stat_accscalar_t = at::acc_type; + int64_t n_input = input_.size(1); + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions + auto output_reshaped = output_.view({input_.size(0), input_.size(1), -1}); + + auto input = get_packed_accessor< + const input_scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); + auto output = get_packed_accessor< + input_scalar_t, 3, RestrictPtrTraits, index_t>(output_reshaped, "output"); + auto weight = packed_accessor_or_dummy< + const stat_scalar_t, 1, RestrictPtrTraits, index_t>(weight_, "weight"); + auto bias = packed_accessor_or_dummy< + const stat_scalar_t, 1, RestrictPtrTraits, index_t>(bias_, "bias"); + auto mean = packed_accessor_or_dummy< + stat_accscalar_t, 1, RestrictPtrTraits, index_t>(mean_, "mean"); + auto invstd = packed_accessor_or_dummy< + stat_accscalar_t, 1, RestrictPtrTraits, index_t>(invstd_, "invstd"); + auto stream = at::cuda::getCurrentCUDAStream(); + + // NOTE: We use transform_input_kernel in training mode, which ignores epsilon + const double dummy_epsilon = 1e-5; + + // The input_transform kernel is pointwise, but we need to balance reading parameters (save_var/mean, + // weight/bias) - which we only do once and have a for loop afterwards - with having many threads and blocks + // and good occupancy. Quiet likely, we could go with even more blocks than 1024. + // The various planes are independent, so we use blocks for them. + int tf = std::max(getNumThreads(input.size(2)/4), + std::min(getNumThreads(input.size(2)), 64)); + int tb = std::max(64/tf, 1); + dim3 blocks_trans(input.size(1), std::max(1, std::min((256*1024)/input.size(1), + (input.size(0)+tb-1)/tb))); + blocks_trans.y = std::min(blocks_trans.y, MAX_GRID_SIZE); + dim3 threads_trans(tf, tb); + batch_norm_transform_input_kernel <<>> + (input, output, mean, invstd, weight, bias, dummy_epsilon); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +std::tuple batch_norm_gather_stats_cuda_template(const Tensor& mean_, const Tensor& invstd_, + const Tensor& running_mean_, const Tensor& running_var_, + double momentum, double epsilon, const Tensor& counts_) { + + Tensor save_mean_; + Tensor save_invstd_; + + auto features = mean_.size(1); + auto input_options = mean_.options(); + if (mean_.scalar_type() == at::ScalarType::Half || mean_.scalar_type() == at::ScalarType::BFloat16) { + input_options = input_options.dtype(ScalarType::Float); + } + save_mean_ = at::empty({features}, input_options); + save_invstd_ = at::empty({features}, input_options); + + auto mean = packed_accessor_or_dummy< + accscalar_t, 2, RestrictPtrTraits, index_t>(mean_, "mean"); + auto invstd = packed_accessor_or_dummy< + accscalar_t, 2, RestrictPtrTraits, index_t>(invstd_, "invstd"); + auto running_mean = packed_accessor_or_dummy< + scalar_t, 1, RestrictPtrTraits, index_t>(running_mean_, "running_mean"); + auto running_var = packed_accessor_or_dummy< + scalar_t, 1, RestrictPtrTraits, index_t>(running_var_, "running_mean"); + auto counts = packed_accessor_or_dummy< + scalar_t, 1, RestrictPtrTraits, index_t>(counts_, "counts"); + + auto save_mean = get_packed_accessor< + accscalar_t, 1, RestrictPtrTraits, index_t>(save_mean_, "save_mean"); + auto save_invstd = get_packed_accessor< + accscalar_t, 1, RestrictPtrTraits, index_t>(save_invstd_, "save_invstd"); + auto stream = at::cuda::getCurrentCUDAStream(); + + int block = getNumThreads(features); + int grid = std::max(1, features/block); + batch_norm_reduce_statistics_kernel <<>> + (mean, invstd, save_mean, save_invstd, running_mean, running_var, epsilon, momentum, counts); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return std::make_tuple(save_mean_, save_invstd_); +} + +template +std::tuple batch_norm_backward_reduce_cuda_template(const Tensor& grad_out_, const Tensor& input_, + const Tensor& mean_, const Tensor& invstd_, const Tensor& weight_, + const bool input_g, const bool weight_g, const bool bias_g) { + + using stat_accscalar_t = at::acc_type; + int64_t n_input = input_.size(1); + Tensor sum_dy_; + Tensor sum_dy_xmu_; + Tensor grad_weight_; + Tensor grad_bias_; + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions + auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes()); + + if (input_g) { + sum_dy_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + sum_dy_xmu_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (weight_g) { + grad_weight_ = at::empty({n_input}, weight_.options()); + } + if (bias_g) { + grad_bias_ = at::empty({n_input}, weight_.options()); + } + + auto input = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); + auto grad_output = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); + auto grad_weight = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_weight_, "grad_weight"); + auto grad_bias = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_, "grad_bias"); + auto mean = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_, "mean"); + auto invstd = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_, "invstd"); + auto sum_dy = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_, "sum_dy"); + auto sum_dy_xmu = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_, "sum_dy_xmu"); + + auto batch_size = input_reshaped.size(0); + auto feature_size = input_reshaped.size(2); + auto stream = at::cuda::getCurrentCUDAStream(); + + int warp_size = at::cuda::warp_size(); + int block_y = std::min(lastPow2(batch_size), MAX_BLOCK_SIZE/warp_size); + // We want block_x to be at least a warp width + int block_x = std::min(std::max(getNumThreads(feature_size), warp_size), MAX_BLOCK_SIZE/block_y); + const dim3 block(block_x, block_y); + const dim3 grid(n_input); + + batch_norm_backward_reduce_kernel <<>> + (input, grad_output, mean, invstd, sum_dy, sum_dy_xmu, grad_weight, grad_bias); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return std::make_tuple(sum_dy_, sum_dy_xmu_, grad_weight_, grad_bias_); +} + +template +Tensor batch_norm_backward_elemt_cuda_template(const Tensor& grad_out_, const Tensor& input_, + const Tensor& mean_, const Tensor& invstd_, + const Tensor& weight_, const Tensor& sum_dy_, const Tensor& sum_dy_xmu_) { + + using stat_accscalar_t = at::acc_type; + int64_t n_input = input_.size(1); + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions + auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes()); + auto grad_input_reshaped = at::empty_like(input_reshaped, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + + auto input = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); + auto grad_input = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input"); + auto grad_output = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); + auto mean = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_, "mean"); + auto invstd = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_, "invstd"); + auto weight = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight"); + auto sum_dy = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_, "sum_dy"); + auto sum_dy_xmu = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_, "sum_dy_xmu"); + + auto stream = at::cuda::getCurrentCUDAStream(); + + // The kernel is pointwise, but we need to balance reading parameters (save_var/mean, + // weight/bias) - which we only do once and have a for loop afterwards - with having many threads and blocks + // and good occupancy. Quiet likely, we could go with even more blocks than 1024. + // The various planes are independent, so we use blocks for them. + int tf = std::max(getNumThreads(input.size(2)/4), + std::min(getNumThreads(input.size(2)), 64)); + int tb = std::max(64/tf, 1); + dim3 blocks_trans(input.size(1), std::max(1, std::min((256*1024)/input.size(1), + (input.size(0)+tb-1)/tb))); + blocks_trans.y = std::min(blocks_trans.y, MAX_GRID_SIZE); + dim3 threads_trans(tf, tb); + auto reduction_size = input_.numel() / n_input; + auto norm_fct = static_cast(1.0 / reduction_size); + batch_norm_backward_elemt_kernel + <<>> + (input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, norm_fct); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return grad_input_reshaped.view(input_.sizes()); +} + +template +Tensor batch_norm_backward_elemt_cuda_template(const Tensor& grad_out_, const Tensor& input_, + const Tensor& mean_, const Tensor& invstd_, + const Tensor& weight_, const Tensor& sum_dy_, const Tensor& sum_dy_xmu_, const Tensor& count) { + + using stat_accscalar_t = at::acc_type; + int64_t n_input = input_.size(1); + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions + auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes()); + auto grad_input_reshaped = at::empty_like(input_reshaped, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + + auto input = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); + auto grad_input = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input"); + auto grad_output = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); + auto mean = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_, "mean"); + auto invstd = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_, "invstd"); + auto weight = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight"); + auto sum_dy = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_, "sum_dy"); + auto sum_dy_xmu = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_, "sum_dy_xmu"); + + auto stream = at::cuda::getCurrentCUDAStream(); + + // The kernel is pointwise, but we need to balance reading parameters (save_var/mean, + // weight/bias) - which we only do once and have a for loop afterwards - with having many threads and blocks + // and good occupancy. Quiet likely, we could go with even more blocks than 1024. + // The various planes are independent, so we use blocks for them. + int tf = std::max(getNumThreads(input.size(2)/4), + std::min(getNumThreads(input.size(2)), 64)); + int tb = std::max(64/tf, 1); + dim3 blocks_trans(input.size(1), std::max(1, std::min((256*1024)/input.size(1), + (input.size(0)+tb-1)/tb))); + blocks_trans.y = std::min(blocks_trans.y, MAX_GRID_SIZE); + dim3 threads_trans(tf, tb); + batch_norm_backward_elemt_kernel <<>> + (input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, count.const_data_ptr(), count.numel()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return grad_input_reshaped.view(input_.sizes()); +} + +// welford kernel for c last tensor calculating mean/biased_variance/unbiased_variance +// original apex name: welford_kernel_c_last +template + +__global__ void +batch_norm_collect_statistics_channels_last_kernel( + const scalar_t* __restrict__ input, + accscalar_t* __restrict__ out_mean, + accscalar_t* __restrict__ out_invstd, + volatile accscalar_t* staging_data, + int* semaphores, + const int reduction_size, + const int stride, + accscalar_t epsilon) { + // hide latency with concurrency + accscalar_t x_mean[PARALLEL_LOADS]; + accscalar_t m_2_n[PARALLEL_LOADS]; + int count[PARALLEL_LOADS]; + +#pragma unroll + for (int i = 0; i < PARALLEL_LOADS; i++) { + x_mean[i] = accscalar_t(0); + m_2_n[i] = accscalar_t(0); + count[i] = accscalar_t(0); + } + // tensor dimension (m,c) + + // loop along m dimension + int inner_loop_stride = blockDim.y * gridDim.y; + + // offset along m dimension + int m_offset = blockIdx.y * blockDim.y + threadIdx.y; + int c_offset = blockIdx.x * blockDim.x + threadIdx.x; + + int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS); + int address_base = m_offset * stride + c_offset; + int address_increment = inner_loop_stride * stride; + + for (int i = 0; i < loop_count; i++) { + accscalar_t x_math[PARALLEL_LOADS]; + accscalar_t x_count_inv[PARALLEL_LOADS]; + accscalar_t is_valid[PARALLEL_LOADS]; + + // load multiple data in +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + if (c_offset < stride && m_offset < reduction_size) { + x_math[j] = input[address_base]; + count[j]++; + x_count_inv[j] = accscalar_t(1) / count[j]; + is_valid[j] = accscalar_t(1); + } else { + x_math[j] = accscalar_t(0); + x_count_inv[j] = accscalar_t(0); + is_valid[j] = accscalar_t(0); + } + m_offset += inner_loop_stride; + address_base += address_increment; + } + + // calculate mean/m2n with welford +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + accscalar_t delta0 = x_math[j] - x_mean[j]; + x_mean[j] += delta0 * x_count_inv[j]; + accscalar_t delta1 = x_math[j] - x_mean[j]; + m_2_n[j] += delta0 * delta1 * is_valid[j]; + } + } + + // thread reduction to accumulate mean/m_2_n/count between PARALLEL_LOADS +#pragma unroll + for (int j = 1; j < PARALLEL_LOADS; j++) { + welford_merge_element(count[0], x_mean[0], m_2_n[0], count[j], x_mean[j], m_2_n[j]); + } + + // release x_mean / m_2_n + auto mean_th = x_mean[0]; + auto m2_th = m_2_n[0]; + auto count_th = count[0]; + + // block-wise reduction with shared memory (since reduction cannot be done within a warp) + static __shared__ accscalar_t shmem_mean[MAX_BLOCK_SIZE]; + static __shared__ accscalar_t shmem_m2n[MAX_BLOCK_SIZE]; + static __shared__ int shmem_count[MAX_BLOCK_SIZE]; + + welford_merge_block_vertical(count_th, mean_th, m2_th, shmem_count, shmem_mean, shmem_m2n); + + if (gridDim.y > 1) { + volatile accscalar_t* staging_mean = staging_data; + volatile accscalar_t* staging_m2n = &staging_data[stride*gridDim.y]; + volatile int* staging_count = reinterpret_cast(&staging_m2n[stride*gridDim.y]); + + address_base = c_offset + blockIdx.y * stride; + // write data to staging_data; + if (threadIdx.y == 0 && c_offset < stride) { + staging_mean[address_base] = mean_th; + staging_m2n[address_base] = m2_th; + staging_count[address_base] = count_th; + } + + __threadfence(); + __syncthreads(); // ensuring writes to staging_ is visible to all blocks + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done = (old == (gridDim.y-1)); + } + + __syncthreads(); + + // check that all data is now available in global memory + if (is_last_block_done) { + count_th = 0; + mean_th = accscalar_t(0.0); + m2_th = accscalar_t(0.0); + + for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { + address_base = c_offset + y * stride; + int count_new = c_offset < stride ? staging_count[address_base] : 0; + accscalar_t mean_new = c_offset < stride ? staging_mean[address_base] : accscalar_t(0.0); + accscalar_t m2n_new = c_offset < stride ? staging_m2n[address_base] : accscalar_t(0.0); + + welford_merge_element(count_th, mean_th, m2_th, count_new, mean_new, m2n_new); + } + + welford_merge_block_vertical(count_th, mean_th, m2_th, shmem_count, shmem_mean, shmem_m2n); + if (threadIdx.y == 0 && c_offset < stride) { + out_mean[c_offset] = static_cast(mean_th); + out_invstd[c_offset] = VarTransform{}(m2_th/count_th, epsilon); + } + } + } else { + if (blockIdx.y == 0 && threadIdx.y == 0 && c_offset < stride) { + out_mean[c_offset] = static_cast(mean_th); + out_invstd[c_offset] = VarTransform{}(m2_th/count_th, epsilon); + } + } +} + +// elementwise BN kernel +// original apex name: batchnorm_forward_c_last_kernel +template < + typename scalar_t, + typename accscalar_t, + typename layerscalar_t, + int PARALLEL_LOADS> +__global__ void batch_norm_transform_input_channels_last_kernel( + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ z, + const accscalar_t* __restrict__ mean, + const accscalar_t* __restrict__ inv_std, + const layerscalar_t* __restrict__ weight, + const layerscalar_t* __restrict__ shift, + scalar_t* __restrict__ out, + const int reduction_size, + const int stride, + const bool fuse_relu) { + // tensor dimension (m,c) + // loop along m dimension + int inner_loop_stride = blockDim.y * gridDim.y; + + // offset along m dimension + int m_offset = blockIdx.y * blockDim.y + threadIdx.y; + int c_offset = blockIdx.x * blockDim.x + threadIdx.x; + + if (c_offset >= stride || m_offset >= reduction_size) { + return; + } + + auto m_c = mean[c_offset]; + auto inv_std_c = static_cast(inv_std[c_offset]); + auto w_c = weight == nullptr ? accscalar_t(1.0) : static_cast(weight[c_offset]); + auto s_c = shift == nullptr ? accscalar_t(0.0) : static_cast(shift[c_offset]); + + int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS); + int address_base = m_offset * stride + c_offset; + int address_increment = inner_loop_stride * stride; + + for (int i = 0; i < loop_count; i++) { +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + if (c_offset < stride && m_offset < reduction_size) { + auto tmp = w_c * (static_cast(input[address_base]) - m_c ) * inv_std_c + s_c; + if (z != nullptr) { + tmp += z[address_base]; + } + out[address_base] = (fuse_relu && tmp <= accscalar_t(0.0) ? scalar_t(0.0) : static_cast(tmp)); + } + m_offset += inner_loop_stride; + address_base += address_increment; + } + } +} + +template +__device__ __forceinline__ void merge_block_vertical_backward(T& sum_dy, + T& sum_dy_xmu, + T* shmem_sum_dy, + T* shmem_sum_dy_xmu) { + // write to shared memory + auto address_base = threadIdx.x + threadIdx.y * blockDim.x; + +#pragma unroll + for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset*2) { + shmem_sum_dy[address_base] = sum_dy; + shmem_sum_dy_xmu[address_base] = sum_dy_xmu; + } + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + auto address = address_base + offset * blockDim.x; + + sum_dy += shmem_sum_dy[address]; + sum_dy_xmu += shmem_sum_dy_xmu[address]; + } + } +} + +// batchnorm backward kernel for c last tensor +// original apex name: reduce_bn_c_last_kernel +template < + int PARALLEL_LOADS, + typename scalar_t, + typename accscalar_t, + typename layerscalar_t> +__global__ void batch_norm_backward_reduce_channels_last_kernel( + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ grad_output, + const accscalar_t* __restrict__ mean, + const accscalar_t* __restrict__ inv_std, + accscalar_t* __restrict__ sum_dy_o, + accscalar_t* __restrict__ sum_dy_xmu_o, + layerscalar_t* __restrict__ grad_weight, + layerscalar_t* __restrict__ grad_bias, + volatile accscalar_t* staging_data, + int* semaphores, + const int reduction_size, + const int stride) { + + // hide latency with concurrency + accscalar_t sum_dy[PARALLEL_LOADS]; + accscalar_t sum_dy_xmu[PARALLEL_LOADS]; + +#pragma unroll + for (int i = 0; i < PARALLEL_LOADS; i++) { + sum_dy[i] = accscalar_t(0); + sum_dy_xmu[i] = accscalar_t(0); + } + // tensor dimension (m,c) + + // loop along m dimension + int inner_loop_stride = blockDim.y * gridDim.y; + + // offset along m dimension + int m_offset = blockIdx.y * blockDim.y + threadIdx.y; + int c_offset = blockIdx.x * blockDim.x + threadIdx.x; + + if (c_offset >= stride || m_offset >= reduction_size) { + return; + } + + int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS); + int address_base = m_offset * stride + c_offset; + int address_increment = inner_loop_stride * stride; + + auto r_mean = mean[c_offset]; + auto factor = inv_std[c_offset]; + + for (int i = 0; i < loop_count; i++) { + accscalar_t x_input[PARALLEL_LOADS]; + accscalar_t x_grad_output[PARALLEL_LOADS]; + + // load multiple data in +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + if (c_offset < stride && m_offset < reduction_size) { + x_input[j] = input[address_base]; + x_grad_output[j] = grad_output[address_base]; + } else { + x_input[j] = accscalar_t(0); + x_grad_output[j] = accscalar_t(0); + } + m_offset += inner_loop_stride; + address_base += address_increment; + } + + // calculate sum_dy / sum_dy_xmu +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + sum_dy[j] += x_grad_output[j]; + sum_dy_xmu[j] += x_grad_output[j] * (x_input[j] - r_mean); + } + } + + // thread reduction to accumulate sum_dy / sum_dy_xmu between PARALLEL_LOADS +#pragma unroll + for (int j = 1; j < PARALLEL_LOADS; j++) { + sum_dy[0] += sum_dy[j]; + sum_dy_xmu[0] += sum_dy_xmu[j]; + } + + // release array of registers + auto sum_dy_th = sum_dy[0]; + auto sum_dy_xmu_th = sum_dy_xmu[0]; + + // block-wise reduction with shared memory (since reduction cannot be done within a warp) + static __shared__ accscalar_t shmem_sum_dy[MAX_BLOCK_SIZE]; + static __shared__ accscalar_t shmem_sum_dy_xmu[MAX_BLOCK_SIZE]; + + merge_block_vertical_backward(sum_dy_th, sum_dy_xmu_th, shmem_sum_dy, shmem_sum_dy_xmu); + + if (gridDim.y > 1) { + volatile accscalar_t* staging_sum_dy = staging_data; + volatile accscalar_t* staging_sum_dy_xmu = &staging_data[stride*gridDim.y]; + + address_base = c_offset + blockIdx.y * stride; + // write data to staging_data; + if (threadIdx.y == 0 && c_offset < stride) { + staging_sum_dy[address_base] = sum_dy_th; + staging_sum_dy_xmu[address_base] = sum_dy_xmu_th; + } + + __threadfence(); + __syncthreads(); // ensuring writes to staging_ is visible to all blocks + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done = (old == (gridDim.y-1)); + } + + __syncthreads(); + + // check that all data is now available in global memory + if (is_last_block_done) { + sum_dy_th = accscalar_t(0.0); + sum_dy_xmu_th = accscalar_t(0.0); + + for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { + address_base = c_offset + y * stride; + sum_dy_th += (c_offset < stride ? staging_sum_dy[address_base] : accscalar_t(0.0)); + sum_dy_xmu_th += (c_offset < stride ? staging_sum_dy_xmu[address_base] : accscalar_t(0.0)); + } + + merge_block_vertical_backward(sum_dy_th, sum_dy_xmu_th, shmem_sum_dy, shmem_sum_dy_xmu); + if (threadIdx.y == 0 && c_offset < stride) { + if (grad_bias != nullptr) { + grad_bias[c_offset] = static_cast(sum_dy_th); + } + if (grad_weight != nullptr) { + grad_weight[c_offset] = static_cast(sum_dy_xmu_th * factor); + } + //mean_dy[c_offset] = sum_dy_th / reduction_size; + //mean_dy_xmu[c_offset] = sum_dy_xmu_th / reduction_size; + sum_dy_o[c_offset] = sum_dy_th; + sum_dy_xmu_o[c_offset] = sum_dy_xmu_th; + } + } + } else { + if (blockIdx.y == 0 && threadIdx.y == 0 && c_offset < stride) { + if (grad_bias != nullptr) { + grad_bias[c_offset] = static_cast(sum_dy_th); + } + if (grad_weight != nullptr) { + grad_weight[c_offset] = static_cast(sum_dy_xmu_th * factor); + } + //mean_dy[c_offset] = sum_dy_th / reduction_size; + //mean_dy_xmu[c_offset] = sum_dy_xmu_th / reduction_size; + sum_dy_o[c_offset] = sum_dy_th; + sum_dy_xmu_o[c_offset] = sum_dy_xmu_th; + } + } +} + +// elementwise BN kernel +// original apex name: batchnorm_backward_c_last_kernel +template < + int PARALLEL_LOADS, + typename scalar_t, + typename accscalar_t, + typename layerscalar_t> +__device__ __forceinline__ void batch_norm_backward_elemt_channels_last_kernel_impl( + const scalar_t* __restrict__ grad_output, + const scalar_t* __restrict__ input, + const accscalar_t* __restrict__ mean, + const accscalar_t* __restrict__ inv_std, + const layerscalar_t* __restrict__ weight, + const accscalar_t* __restrict__ sum_dy, + const accscalar_t* __restrict__ sum_dy_xmu, + scalar_t* __restrict__ grad_input, + const accscalar_t norm_fct, + const int reduction_size, + const int stride) { + // tensor dimension (m,c) + // loop along m dimension + int inner_loop_stride = blockDim.y * gridDim.y; + + // offset along m dimension + int m_offset = blockIdx.y * blockDim.y + threadIdx.y; + int c_offset = blockIdx.x * blockDim.x + threadIdx.x; + + if (c_offset >= stride || m_offset >= reduction_size) { + return; + } + + auto m_c = mean[c_offset]; + auto m_dy_c = sum_dy[c_offset] * norm_fct; + auto factor_1_c = inv_std[c_offset]; + auto factor_2_c = (weight == nullptr? accscalar_t(1.0) : static_cast(weight[c_offset])) * factor_1_c; + factor_1_c = factor_1_c * factor_1_c * sum_dy_xmu[c_offset] * norm_fct; + + int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS); + int address_base = m_offset * stride + c_offset; + int address_increment = inner_loop_stride * stride; + + for (int i = 0; i < loop_count; i++) { +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + if (c_offset < stride && m_offset < reduction_size) { + grad_input[address_base] = static_cast( + (static_cast(grad_output[address_base]) - m_dy_c - + (static_cast(input[address_base]) - m_c) * factor_1_c) + * factor_2_c); + } + m_offset += inner_loop_stride; + address_base += address_increment; + } + } +} + +template < + int PARALLEL_LOADS, + typename scalar_t, + typename accscalar_t, + typename layerscalar_t> +__global__ void batch_norm_backward_elemt_channels_last_kernel( + const scalar_t* __restrict__ grad_output, + const scalar_t* __restrict__ input, + const accscalar_t* __restrict__ mean, + const accscalar_t* __restrict__ inv_std, + const layerscalar_t* __restrict__ weight, + const accscalar_t* __restrict__ sum_dy, + const accscalar_t* __restrict__ sum_dy_xmu, + const int* __restrict__ numel, + scalar_t* __restrict__ grad_input, + const int64_t world_size, + const int reduction_size, + const int stride) { + + int64_t total_numel = 0; + for (int i = 0; i < world_size; i++) { + total_numel += numel[i]; + } + + auto norm_fct = static_cast(1) / static_cast(total_numel); + batch_norm_backward_elemt_channels_last_kernel_impl( + grad_output, input, mean, inv_std, weight, sum_dy, sum_dy_xmu, + grad_input, norm_fct, reduction_size, stride); +} + +template < + int PARALLEL_LOADS, + typename scalar_t, + typename accscalar_t, + typename layerscalar_t> +__global__ void batch_norm_backward_elemt_channels_last_kernel( + const scalar_t* __restrict__ grad_output, + const scalar_t* __restrict__ input, + const accscalar_t* __restrict__ mean, + const accscalar_t* __restrict__ inv_std, + const layerscalar_t* __restrict__ weight, + const accscalar_t* __restrict__ sum_dy, + const accscalar_t* __restrict__ sum_dy_xmu, + scalar_t* __restrict__ grad_input, + const accscalar_t norm_fct, + const int reduction_size, + const int stride) { + batch_norm_backward_elemt_channels_last_kernel_impl( + grad_output, input, mean, inv_std, weight, sum_dy, sum_dy_xmu, + grad_input, norm_fct, reduction_size, stride); +} + +template +void batch_norm_stats_channels_last_cuda_template( + const Tensor& out_mean, const Tensor& out_invstd, const Tensor& input, double epsilon) { + using accscalar_t = at::acc_type; + + const auto stride = input.sizes()[1]; + const auto reduction_size = input.numel() / stride; + + resize_output(out_mean, {stride}); + resize_output(out_invstd, {stride}); + TORCH_INTERNAL_ASSERT(out_invstd.dim() == 1 && out_invstd.is_contiguous() && + out_invstd.sizes()[0]); + TORCH_INTERNAL_ASSERT(out_mean.dim() == 1 && out_mean.is_contiguous() && + out_mean.sizes()[0]); + + dim3 block; + dim3 grid; + flexible_launch_configs(reduction_size, stride, block, grid, true); + + at::Tensor staging_data; + at::Tensor semaphores; + if (grid.y > 1) { + staging_data = at::empty({4*stride*grid.y}, out_mean.options()); + semaphores = at::zeros({grid.x}, input.options().dtype(at::kInt)); + } + + accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.mutable_data_ptr() : nullptr; + int* semaphores_ptr = grid.y > 1 ? semaphores.mutable_data_ptr() : nullptr; + batch_norm_collect_statistics_channels_last_kernel + <<>>( + input.const_data_ptr(), + out_mean.mutable_data_ptr(), + out_invstd.mutable_data_ptr(), + staging_data_ptr, + semaphores_ptr, + reduction_size, + stride, + epsilon); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +void batch_norm_elemt_channels_last_cuda_template( + const at::Tensor& output, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& shift, // bias of BN + const at::Tensor& mean, + const at::Tensor& inv_std, + const std::optional& z = std::nullopt, // bias after BN + const bool fuse_relu = false) { + const auto stride = input.sizes()[1]; + const auto reduction_size = input.numel() / stride; + + dim3 block; + dim3 grid; + flexible_launch_configs(reduction_size, stride, block, grid); + + auto stream = at::cuda::getCurrentCUDAStream(); + const auto second_dtype = weight.defined() ? weight.scalar_type() : + (shift.defined() ? shift.scalar_type() : input.scalar_type()); + + if (input.scalar_type() != second_dtype) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward", [&] { + using accscalar_t = at::acc_type; + batch_norm_transform_input_channels_last_kernel + <<>>( + input.const_data_ptr(), + z.has_value() ? z.value().const_data_ptr() : nullptr, + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.defined() ? weight.const_data_ptr() : nullptr, + shift.defined() ? shift.const_data_ptr() : nullptr, + output.mutable_data_ptr(), + reduction_size, + stride, + fuse_relu); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } else { + if (weight.defined()){ + TORCH_CHECK(input.scalar_type() == weight.scalar_type(), "batchnorm_forward: input.scalar_type() ", input.scalar_type(), + " is not supported with weight.scalar_type() ", weight.scalar_type()); + } + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward", [&] { + using accscalar_t = at::acc_type; + batch_norm_transform_input_channels_last_kernel + <<>>( + input.const_data_ptr(), + z.has_value() ? z.value().const_data_ptr() : nullptr, + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.defined() ? weight.const_data_ptr() : nullptr, + shift.defined() ? shift.const_data_ptr(): nullptr, + output.mutable_data_ptr(), + reduction_size, + stride, + fuse_relu); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } +} + +std::tuple +batch_norm_backward_reduce_cuda_channels_last_template(const at::Tensor& grad_output, + const at::Tensor& input, + const at::Tensor& mean, + const at::Tensor& inv_std, + const at::Tensor& weight, + const bool input_g, const bool weight_g, const bool bias_g) { + const auto stride = input.sizes()[1]; + const auto reduction_size = input.numel() / stride; + + at::Tensor sumn_dy = at::empty({stride}, mean.options()); + at::Tensor sum_dy_xmu = at::empty({stride}, mean.options()); + + at::Tensor grad_weight; + at::Tensor grad_bias; + if (weight.defined()) { + grad_weight = at::empty({stride}, weight.options()); + grad_bias = at::empty({stride}, weight.options()); + } else { + // because I cannot return an uninitialized at::Tensor + grad_weight = at::empty({0}, mean.options()); + grad_bias = at::empty({0}, mean.options()); + } + + dim3 block; + dim3 grid; + flexible_launch_configs(reduction_size, stride, block, grid, true); + + at::Tensor staging_data; + at::Tensor semaphores; + if (grid.y > 1) { + staging_data = at::empty({2*stride*grid.y}, mean.options()); + semaphores = at::zeros({grid.x}, input.options().dtype(at::kInt)); + } + auto stream = at::cuda::getCurrentCUDAStream(); + + if (weight.defined() && input.scalar_type() != weight.scalar_type()) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_reduce", [&] { + using accscalar_t = at::acc_type; + accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.mutable_data_ptr() : nullptr; + int* semaphores_ptr = grid.y > 1 ? semaphores.mutable_data_ptr() : nullptr; + batch_norm_backward_reduce_channels_last_kernel + <<>>( + input.const_data_ptr(), + grad_output.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + sumn_dy.mutable_data_ptr(), + sum_dy_xmu.mutable_data_ptr(), + grad_weight.mutable_data_ptr(), + grad_bias.mutable_data_ptr(), + staging_data_ptr, + semaphores_ptr, + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } else { + if (weight.defined()) { + TORCH_CHECK(input.scalar_type() == weight.scalar_type(), "batchnorm_backward_reduce: input.scalar_type() ", input.scalar_type(), + " is not supported with weight.scalar_type() ", weight.scalar_type()); + } + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_reduce", [&] { + using accscalar_t = at::acc_type; + accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.mutable_data_ptr() : nullptr; + int* semaphores_ptr = grid.y > 1 ? semaphores.mutable_data_ptr() : nullptr; + batch_norm_backward_reduce_channels_last_kernel + <<>>( + input.const_data_ptr(), + grad_output.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + sumn_dy.mutable_data_ptr(), + sum_dy_xmu.mutable_data_ptr(), + weight.defined() ? grad_weight.mutable_data_ptr() : nullptr, + weight.defined() ? grad_bias.mutable_data_ptr() : nullptr, + staging_data_ptr, + semaphores_ptr, + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } + + return std::make_tuple(sumn_dy, sum_dy_xmu, grad_weight, grad_bias); +} + +at::Tensor batch_norm_backward_elemt_channels_last_cuda_template( + const at::Tensor& grad_output, + const at::Tensor& input, + const at::Tensor& mean, + const at::Tensor& inv_std, + const at::Tensor& weight, + const at::Tensor& sum_dy, + const at::Tensor& sum_dy_xmu, + const at::Tensor& count) { + const auto stride = input.sizes()[1]; + const auto reduction_size = input.numel() / stride; + + // Input is guaranteed to be channels-last compatible + at::Tensor grad_input = at::empty_like(input); + + dim3 block; + dim3 grid; + flexible_launch_configs(reduction_size, stride, block, grid); + + auto stream = at::cuda::getCurrentCUDAStream(); + + if (weight.defined() && weight.scalar_type() != input.scalar_type()) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_element", [&] { + using accscalar_t = at::acc_type; + batch_norm_backward_elemt_channels_last_kernel + <<>>( + grad_output.const_data_ptr(), + input.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.const_data_ptr(), + sum_dy.const_data_ptr(), + sum_dy_xmu.const_data_ptr(), + count.const_data_ptr(), + grad_input.mutable_data_ptr(), + count.numel(), + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } else { + if (weight.defined()) { + TORCH_CHECK(input.scalar_type() == weight.scalar_type(), "batchnorm_backward_element: input.scalar_type() ", input.scalar_type(), + " is not supported with weight.scalar_type() ", weight.scalar_type()); + } + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "batchnorm_backward_element", [&] { + using accscalar_t = at::acc_type; + batch_norm_backward_elemt_channels_last_kernel + <<>>( + grad_output.const_data_ptr(), + input.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.defined() ? weight.const_data_ptr() : nullptr, + sum_dy.const_data_ptr(), + sum_dy_xmu.const_data_ptr(), + count.const_data_ptr(), + grad_input.mutable_data_ptr(), + count.numel(), + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } + + return grad_input; +} + +at::Tensor batch_norm_backward_elemt_channels_last_cuda_template( + const at::Tensor& grad_output, + const at::Tensor& input, + const at::Tensor& mean, + const at::Tensor& inv_std, + const at::Tensor& weight, + const at::Tensor& sum_dy, + const at::Tensor& sum_dy_xmu) { + const auto stride = input.sizes()[1]; + const auto reduction_size = input.numel() / stride; + auto norm_fct = 1.0 / reduction_size; + + // Input is guaranteed to be channels-last compatible + at::Tensor grad_input = at::empty_like(input); + + dim3 block; + dim3 grid; + flexible_launch_configs(reduction_size, stride, block, grid); + + auto stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_element", [&] { + using accscalar_t = at::acc_type; + + if (weight.defined() && weight.scalar_type() != input.scalar_type()) { + batch_norm_backward_elemt_channels_last_kernel + <<>>( + grad_output.const_data_ptr(), + input.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.const_data_ptr(), + sum_dy.const_data_ptr(), + sum_dy_xmu.const_data_ptr(), + grad_input.mutable_data_ptr(), + static_cast(norm_fct), + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + batch_norm_backward_elemt_channels_last_kernel + <<>>( + grad_output.const_data_ptr(), + input.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.defined() ? weight.const_data_ptr() : nullptr, + sum_dy.const_data_ptr(), + sum_dy_xmu.const_data_ptr(), + grad_input.mutable_data_ptr(), + static_cast(norm_fct), + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + }); + + return grad_input; +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c98379b40d11e0a293bf8b13bc547321eaebc901 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh @@ -0,0 +1,407 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace { + +int log2_ceil(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) ++log2_value; + return log2_value; +} + +template +struct Add { + __device__ __forceinline__ T operator()(T a, T b) const { + return a + b; + } +}; + +template +struct Max { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(acc_t* sum) { + ReduceOp r; + #pragma unroll + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) { + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + acc_t b = WARP_SHFL_XOR(sum[i], offset, WARP_SIZE); + sum[i] = r(sum[i], b); + } + } +} + +// The softmax_warp_* methods perform softmax forward and backward propagation on samples spanning the fast dimension. +// Each sample contains element_count scalar elements. element_count can be any integer value <= 1024. +// The template arguments have the following meaning: +// One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples. +// WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small. +// A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp. +// This is important because it means only __shfl_ instructions are required for reductions. +// Note that this means WARP_SIZE must be a power of two and <= architecture warp size. +// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch. +// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs. +// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed. +// is_masked is a flag indicating whether SoftMax or MaskedSoftMax should be computed. +// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t. +// This allows SoftMax to be fused with a cast immediately following the SoftMax. +// The mask should have the same shape as input, with a boolean indicate if the value is masked. +// The head_chunk_size is only used for transformer mask softmax, equals to H * D * D. +// For instance: +// input_t=half, acc_t=float, output_t=half => read half tensor, float accumulators, write half tensor. +// input_t=half, acc_t=float, output_t=float => read half tensor, float accumulators, write float tensor. +// input_t_float, acc_t=float, output_t=half => read float tensor, float accumulators, write half tensor. + +template +__global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batch_size, int stride, int element_count, const bool *mask = nullptr, const int head_chunk_size = -1, bool is_transformer_mask = false) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + int idx_offset = first_batch * stride + local_idx; + + src += idx_offset; + dst += idx_offset; + + if (is_transformer_mask) { + mask += ((first_batch * stride) / head_chunk_size) * stride + local_idx; + } else { + mask += idx_offset; + } + // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, + // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep + // the nested loops. + // This should have no impact on performance because the loops are unrolled anyway. + + // load data from global memory + acc_t elements[WARP_BATCH][WARP_ITERATIONS]; + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + elements[i][it] = src[i*element_count+it*WARP_SIZE]; + } else { + elements[i][it] = -std::numeric_limits::infinity(); + } + } + } + + // compute max_value + acc_t max_value[WARP_BATCH]; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + bool is_meaningful_max = false; + max_value[i] = elements[i][0]; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (is_masked) { + int idx = it*WARP_SIZE; + if ((idx + local_idx) < batch_element_count) { + if (!is_transformer_mask) { + idx += i*element_count; + } + if (!mask[idx]) { + max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it]; + is_meaningful_max = true; + } + } + } else { + max_value[i] = max_value[i] > elements[i][it] ? max_value[i] : elements[i][it]; + } + } + if (is_masked) { + if (!is_meaningful_max) { + max_value[i] = -std::numeric_limits::infinity(); + } + } + } + warp_reduce(max_value); + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked) { + if (is_log_softmax) { + sum[i] += std::exp(elements[i][it] - max_value[i]); + } else { + elements[i][it] = std::exp(elements[i][it] - max_value[i]); + sum[i] += elements[i][it]; + } + } else { + int idx = it*WARP_SIZE; + bool valid = (idx + local_idx) < batch_element_count; + if (!is_transformer_mask) { + idx += i*element_count; + } + if (valid) { + if (!mask[idx]) { + if (is_log_softmax) { + sum[i] += std::exp(elements[i][it] - max_value[i]); + } else { + elements[i][it] = std::exp(elements[i][it] - max_value[i]); + sum[i] += elements[i][it]; + } + } else { + if (!is_log_softmax) { + // Masked values are treated as -infinity, and std::exp(-infinity) is 0. + elements[i][it] = 0; + } + } + } else { + if (!is_log_softmax) { + elements[i][it] = 0.; + } + } + } + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + if (is_log_softmax) sum[i] = std::log(sum[i]); + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < element_count) { + if (is_log_softmax) { + dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i]; + } else if (sum[i] == 0) { + dst[i*element_count+it*WARP_SIZE] = std::numeric_limits::quiet_NaN(); + } else { + dst[i*element_count+it*WARP_SIZE] = elements[i][it] / sum[i]; + } + } else { + break; + } + } + } +} + +template +__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count, const bool *mask = nullptr) +{ + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel. + constexpr int next_power_of_two = 1 << log2_elements; + constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; + constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE; + constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1; + + int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; + + // batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_size - first_batch; + if (local_batches > WARP_BATCH) + local_batches = WARP_BATCH; + + // there might be multiple batches per warp. compute the index within the batch + int local_idx = threadIdx.x % WARP_SIZE; + + // the first element to process by the current thread + int thread_offset = first_batch * stride + local_idx; + grad += thread_offset; + output += thread_offset; + gradInput += thread_offset; + if (is_masked) { + mask += thread_offset; + } + + // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop, + // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep + // the nested loops. + // This should have no impact on performance because the loops are unrolled anyway. + + // load data from global memory + acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS]; + acc_t output_reg[WARP_BATCH][WARP_ITERATIONS]; + for (int i = 0; i < WARP_BATCH; ++i) { + int batch_element_count = (i >= local_batches) ? 0 : element_count; + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < batch_element_count) { + grad_reg[i][it] = grad[i*element_count+it*WARP_SIZE]; + output_reg[i][it] = output[i*element_count+it*WARP_SIZE]; + } else { + grad_reg[i][it] = acc_t(0); + output_reg[i][it] = acc_t(0); + } + } + } + + acc_t sum[WARP_BATCH] { 0.0f }; + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + if (!is_masked || !mask[i*element_count+it*WARP_SIZE]) { + sum[i] += grad_reg[i][it]; + } + } + } + warp_reduce(sum); + + // store result + #pragma unroll + for (int i = 0; i < WARP_BATCH; ++i) { + if (i >= local_batches) + break; + #pragma unroll + for (int it = 0; it < WARP_ITERATIONS; ++it) { + int element_index = local_idx + it * WARP_SIZE; + if (element_index < element_count) { + if (is_masked && mask[i*element_count+it*WARP_SIZE]) { + gradInput[i*element_count+it*WARP_SIZE] = 0; + } + // compute gradients + else if (is_log_softmax) { + gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]); + } else { + gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]); + } + } + } + } +} + +} // end of anonymous namespace + +template +void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr, int chunk_size = -1, bool is_transformer_mask = false) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward. + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_count + batches_per_block - 1) / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + #define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) case L2E: \ + softmax_warp_forward \ + <<>>(dst, \ + src, batch_count, softmax_elements_stride, softmax_elements, mask, chunk_size, is_transformer_mask); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_FORWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_FORWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_FORWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_FORWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_FORWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_FORWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_FORWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_FORWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_FORWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_FORWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_FORWARD(10); // 1024 + LAUNCH_SOFTMAX_WARP_FORWARD(11); // 2048 + default: + break; + } + } +} + +template +void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr) +{ + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 ); + if (softmax_elements == 0) { + return; + } else { + int log2_elements = log2_ceil(softmax_elements); + const int next_power_of_two = 1 << log2_elements; + + // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward. + int warp_size = at::cuda::warp_size(); + warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size; + + // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. + int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; + + // use 128 threads per block to maximize gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = (batch_count + batches_per_block - 1) / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + // Launch code would be more elegant if C++ supported FOR CONSTEXPR + switch (log2_elements) { + #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E: \ + softmax_warp_backward \ + <<>> \ + (grad_input, grad, output, batch_count, softmax_elements_stride, \ + softmax_elements, mask); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); \ + break; + + LAUNCH_SOFTMAX_WARP_BACKWARD(0); // 1 + LAUNCH_SOFTMAX_WARP_BACKWARD(1); // 2 + LAUNCH_SOFTMAX_WARP_BACKWARD(2); // 4 + LAUNCH_SOFTMAX_WARP_BACKWARD(3); // 8 + LAUNCH_SOFTMAX_WARP_BACKWARD(4); // 16 + LAUNCH_SOFTMAX_WARP_BACKWARD(5); // 32 + LAUNCH_SOFTMAX_WARP_BACKWARD(6); // 64 + LAUNCH_SOFTMAX_WARP_BACKWARD(7); // 128 + LAUNCH_SOFTMAX_WARP_BACKWARD(8); // 256 + LAUNCH_SOFTMAX_WARP_BACKWARD(9); // 512 + LAUNCH_SOFTMAX_WARP_BACKWARD(10); // 1024 + default: + break; + } + } +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Pow.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Pow.cuh new file mode 100644 index 0000000000000000000000000000000000000000..8ee3cd13337f9abb3ddf2d8a516ebaff9e458b47 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Pow.cuh @@ -0,0 +1,63 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +namespace { + + +// SFINAE doesn't work well with NVCC under Windows for math functions like pow and sqrt. +// So we need to define the functions with the explicit function signatures. +// As for pow, the following signatures are defined as the device function: +// pow(float, int) +// pow(double, int) +// pow(float, float) +// pow(double, double) +#if defined(_MSC_VER) || defined(_LIBCPP_VERSION) +// Functions for pow +// pow for at::Half +static inline __host__ __device__ at::Half pow_(at::Half base, at::Half exp) { + return static_cast(std::pow(static_cast(base), static_cast(exp))); +} +// pow for at::BFloat16 +static inline __host__ __device__ at::BFloat16 pow_(at::BFloat16 base, at::BFloat16 exp) { + return static_cast(std::pow(static_cast(base), static_cast(exp))); +} +// pow (floating, floating/int) +template +static inline __host__ __device__ typename std::enable_if_t && (std::is_same_v || std::is_same_v), Base_type> + pow_(Base_type base, Exp_type exp) { + return std::pow(base, exp); +} +// pow (Otherwise) +template +static inline __host__ __device__ typename std::enable_if_t && !std::is_same_v, Base_type> + pow_(Base_type base, Exp_type exp) { + return static_cast(std::pow(static_cast(base), static_cast(exp))); +} +#else +template +static inline __host__ __device__ Base_type pow_(Base_type base, Exp_type exp) { + return ::pow(base, exp); +} +#endif + +template +static inline __host__ __device__ std::enable_if_t, T> pow_( + T base, T exp) { + return at::native::powi(base, exp); +} + +template +static inline __host__ __device__ c10::complex pow_(c10::complex base, c10::complex exp) { + return c10_complex_math::pow(base, exp); +} + +} // namespace +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Randperm.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Randperm.cuh new file mode 100644 index 0000000000000000000000000000000000000000..8ac62c82fd0f8d1277f437311483cd0b64d0ad09 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Randperm.cuh @@ -0,0 +1,63 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include + +#include +#include +#include + +namespace { + +// See note [Algorithm of randperm] +template +__global__ void randperm_handle_duplicate_keys_kernel(T *keys, scalar_t *data, T mask, int n, at::PhiloxCudaState philox_args) { + int tid = threadIdx.x + blockDim.x * blockIdx.x; + + // find the beginning of islands + if (tid >= n - 1) return; // out of range + if ((keys[tid] & mask) != (keys[tid + 1] & mask)) return; // not in an island + if (tid != 0 && (keys[tid] & mask) == (keys[tid - 1] & mask)) return; // not the beginning of an island + + // find the size of islands + int island_size = 0; + do { island_size++; } + while ((tid + island_size < n) && (keys[tid + island_size] & mask) == (keys[tid] & mask)); + + // do random permutation inside each island. + data += tid; + const auto [seed, offset] = at::cuda::philox::unpack(philox_args); + curandStatePhilox4_32_10_t state; + curand_init(seed, tid, offset, &state); + for (int i = island_size - 1; i > 0; i--) { + unsigned int r = curand(&state) % (i + 1); + if (i != r) { + scalar_t tmp = data[i]; + data[i] = data[r]; + data[r] = tmp; + } + } +} + +// See note [Algorithm of randperm] +template +void randperm_handle_duplicate_keys(T *keys, scalar_t *data, int bits, int64_t n, std::optional &gen_) { + auto gen = at::get_generator_or_default(gen_, at::cuda::detail::getDefaultCUDAGenerator()); + int64_t counter_offset = n; + at::PhiloxCudaState rng_engine_inputs; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_cuda_state(counter_offset); + } + T mask = static_cast((1UL << bits) - 1); + randperm_handle_duplicate_keys_kernel<<<(n + 511) / 512, 512, 0, at::cuda::getCurrentCUDAStream()>>>( + keys, data, mask, n, rng_engine_inputs); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Reduce.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Reduce.cuh new file mode 100644 index 0000000000000000000000000000000000000000..78104872731ac756aae3460d9852c345370f00d4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Reduce.cuh @@ -0,0 +1,1450 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at::native { + +static inline int64_t div_up(int64_t a, int64_t b) { + return (a + b - 1) / b; +} + +// returns floor(log2(n)) +static inline int last_pow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +// returns reduced fraction numerator & denominator +C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. + size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; +} + +//template for changing MAX_NUM_THREADS based on op dtype +template +struct mnt_wrapper { + static constexpr int MAX_NUM_THREADS = 512; +}; + +template <> +struct mnt_wrapper >{ + static constexpr int MAX_NUM_THREADS = 256; +}; + +constexpr int max_reduce_threads(c10::ScalarType type) { + return type == kComplexDouble ? 256 : 512; +} + +struct ReduceConfig { + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + ReduceConfig(int element_size_bytes, int num_outputs, int num_inputs) + : element_size_bytes(element_size_bytes) + , num_inputs(num_inputs) + , num_outputs(num_outputs) {} + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + template + void set_block_dimension(int64_t dim0, int64_t dim1) { + const int max_num_threads = mnt_wrapper::MAX_NUM_THREADS / output_vec_size; + int dim0_pow2 = dim0 < max_num_threads ? static_cast(last_pow2(dim0)) : max_num_threads; + int dim1_pow2 = dim1 < max_num_threads ? static_cast(last_pow2(dim1)) : max_num_threads; + block_width = std::min(dim0_pow2, int(at::cuda::warp_size())); + block_height = std::min(dim1_pow2, int(max_num_threads / block_width)); + block_width = std::min(dim0_pow2, int(max_num_threads / block_height)); + num_threads = block_width * block_height; + } + + int split_input(int parallelism) { + int step = step_input; + step_input *= parallelism; + return step; + } + + int split_output(int parallelism) { + int step = step_output; + step_output *= parallelism; + return step; + } + + dim3 block() const { + return dim3(block_width, block_height); + } + + dim3 grid() const { + return dim3(div_up(num_outputs / output_vec_size, step_output), ctas_per_output); + } + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + int shared_memory_size() const { + if (!should_block_y_reduce() && + (!should_block_x_reduce() || + block_width <= at::cuda::warp_size())) { + return 0; + } + return element_size_bytes * num_threads * output_vec_size; + } + + int64_t global_memory_size() const { + if (!should_global_reduce()) { + return 0; + } + auto size = (int64_t)element_size_bytes * num_outputs * ctas_per_output; + if (!should_block_x_reduce()) { + size *= block().x * output_vec_size; + } + return size; + } + + int semaphore_size() const { + if (!should_global_reduce()) { + return 0; + } + return sizeof(int) * grid().x; + } + + int values_per_thread() const { + return div_up(num_inputs, step_input); + } + + int mock_values_per_thread(int parallelism) { + return div_up(num_inputs, step_input * parallelism); + } +}; + +std::ostream& operator<<(std::ostream& out, const ReduceConfig& config); + +template +C10_LAUNCH_BOUNDS_2(nt, 4) +__global__ void reduce_kernel(R reduction) { + reduction.template run(); +} + +template +static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& iter) { + int num_reduce_dims = iter.num_reduce_dims(); + int num_output_dims = iter.ndim() - num_reduce_dims; + int input_index = iter.ntensors() - 1; + int output_index = 0; + std::array strides = { + iter.strides(output_index).data() + num_reduce_dims, + iter.strides(input_index).data() + num_reduce_dims, + }; + auto shape = iter.shape().data() + num_reduce_dims; + return OffsetCalculator<2, index_t>(num_output_dims, shape, strides.data()); +} + +template +static OffsetCalculator<1, index_t> make_input_calculator(const TensorIterator& iter) { + int num_reduce_dims = iter.num_reduce_dims(); + int input_index = iter.ntensors() - 1; + std::array strides = { + iter.strides(input_index).data(), + }; + return OffsetCalculator<1, index_t>(num_reduce_dims, iter.shape().data(), strides.data()); +} + +template +struct func_wrapper_t { + using arg_t = typename binary_function_traits::arg1_t; + using scalar_t = typename binary_function_traits::arg2_t; + + func_t combine; + static inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + static inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + static __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + func_wrapper_t(const func_t& op) : combine(op) { + } + + // wrap a normal reduction that ignores the index + __device__ arg_t reduce(arg_t acc, scalar_t val, int64_t idx) const { + return combine(acc, val); + } +}; + +template +func_wrapper_t func_wrapper(const func_t& op) { + return func_wrapper_t { op }; +} + +template +struct ReduceJitOp { +//ReduceJitOp is almost like ReduceOp, but it doesn't have ops functor that specifies reduction operations +//Maybe we can find a way to unify ReduceOp and ReduceJitOp + using InputCalculator = OffsetCalculator<1, uint32_t>; + using OutputCalculator = OffsetCalculator<2, uint32_t>; + //TODO for now arg_t is always opmath_t of the input, later we'll need to change it + using arg_t = at::opmath_type; + + //TODO - ReduceJitOp will probably need to be changed for reductions that need full functor, + //not just wrapper + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceJitOp( + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + std::optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } +}; + +template +struct ReduceOp { + using traits = function_traits; + using arg_t = typename std::decay::type>::type; + + using InputCalculator = OffsetCalculator<1, index_t>; + using OutputCalculator = OffsetCalculator<2, index_t>; + + static constexpr bool can_accumulate_in_output = + std::is_convertible_v + && std::is_convertible_v; + + ops_t ops; + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceOp( + ops_t ops, + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + std::optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ops(ops), + ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } + + template + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + index_t output_idx = config.output_idx(); + index_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = std::array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + value = thread_reduce(input_slice); + } + + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (config.should_block_y_reduce()) { + value = block_y_reduce(value, shared_memory); + } + using out_ptr_vec_t = std::array; + using offset_vec_t = std::array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE std::array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + CUDA_KERNEL_ASSERT(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. + return {input_vectorized_thread_reduce_impl(data)}; + } else { + index_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](index_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](index_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](index_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + index_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(at::native::memory::aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((uint64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = ops.reduce(value, c10::load(data + threadIdx.x), threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = at::native::memory::aligned_vector; + + index_t idx = config.input_idx(); + const index_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + while (idx * input_vec_size + input_vec_size - 1 < end) { + const auto values_vec = memory::load_vector(data, idx); + #pragma unroll + for (index_t i = 0; i < input_vec_size; i++) { + value_list[i] = ops.reduce(value_list[i], values_vec.val[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + index_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + const auto value = c10::load(data + idx); + value_list[0] = ops.reduce(value_list[0], value, idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = ops.combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE std::array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + index_t idx = config.input_idx(); + const index_t end = config.num_inputs; + const index_t stride = config.step_input; + + using arg_vec_t = std::array; + using load_t = at::native::memory::aligned_vector; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + const auto offset = calc(idx + i * stride) / output_vec_size; + values[i] = memory::load_vector(data_, offset); + } + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + const auto offset = calc(idx) / output_vec_size; + values[i] = memory::load_vector(data_, offset); + idx += stride; + } + idx = idx_; + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = ops.combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + + template + C10_DEVICE std::array block_x_reduce(std::array value, char* shared_memory) const { + using args_vec_t = std::array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + // Intra-warp reduction, fix CUDA to have offset decreasing for better numerics + // matching Triton, etc. + // TODO(PaulZhang12): AMD and internal + #if defined(USE_ROCM) || defined(FBCODE_CAFFE2) + for (int offset = 1; offset < dim_x; offset <<= 1) { + #else + for (int offset = dim_x >> 1; offset > 0; offset >>= 1) { + #endif + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = ops.warp_shfl_down(value[i], offset); + value[i] = ops.combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE std::array block_y_reduce(std::array value, char* shared_memory) const { + using args_vec_t = std::array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE std::array accumulate_in_output( + std::array out, + std::array value, + typename std::enable_if_t* = nullptr + ) const { + std::array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + ret[i] = ops.combine(*(out[i]), value[i]); + } + return ret; + } + + template + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, + typename std::enable_if_t* = nullptr + ) const { + CUDA_KERNEL_ASSERT(!final_output); + return (out_scalar_t)value; + } + + // This function should never be called -- + // it's the version of `accumulate_in_output` + // when accumulation in the output is not possible. + template + C10_DEVICE std::array accumulate_in_output( + std::array, + std::array, + typename std::enable_if_t* = nullptr + ) const { + CUDA_KERNEL_ASSERT(false); + return {arg_t{}}; + } + + // This function should never be called -- + // it's the version of `get_accumulated_output` + // when accumulation in the output is not possible. + template + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, + typename std::enable_if_t* = nullptr + ) const { + CUDA_KERNEL_ASSERT(false); + return *out; + } + + template + C10_DEVICE void set_results(const T x, const index_t base_offset) const { + CUDA_KERNEL_ASSERT(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + + //Currently implemented for max of two outputs + template + C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { + if (noutputs >= 1) { + auto res0 = (T1*)((char*)dst[0] + base_offset); + *res0 = x.first; + } + if (noutputs >= 2) { + // base offset is computed assuming element size being sizeof(T1), so we need to make a + // correction to obtain the correct base offset + auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); + *res1 = x.second; + } + } + + template + C10_DEVICE void set_results_to_output(std::array value, std::array base_offset) const { + CUDA_KERNEL_ASSERT(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(ops.project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE std::array global_reduce(std::array value, std::array *acc, char* shared_memory) const { + using arg_vec_t = std::array; + using out_ptr_vec_t = std::array; + using offset_vec_t = std::array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + index_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + index_t offset = config.staging_memory_offset(blockIdx.y); +#ifndef USE_ROCM + reduce_buffer[offset] = value; +#else // [CMTSTRS] + // In architectures with split caches, global fences are costly. + // Here we preempt need for fences by committing stores to global memory. + cmtdStore(&reduce_buffer[offset], value); +#endif + } + +#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS] + __threadfence(); // make sure writes are globally visible +#endif + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { +#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS] + __threadfence(); // complete the acquire pattern after atomic +#endif + for (auto &v : value) { + v = ident; + } + if (config.should_block_x_reduce()) { + index_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + index_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + index_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[i]); + } + } + } else { + index_t input_offset = threadIdx.y; + index_t step = blockDim.y; +#ifdef USE_ROCM // Prefetch loads to better hide their latency + #define PRFCH 4 + for (; input_offset < config.ctas_per_output; input_offset += step*PRFCH) { + arg_vec_t next[PRFCH]; + #pragma unroll + for (int u = 0; (u < PRFCH) && (input_offset + u*step < config.ctas_per_output); u++) { + index_t idx = config.staging_memory_offset(input_offset + u*step); + next[u] = reduce_buffer[idx]; + } + for (int u = 0; (u < PRFCH) && (input_offset + u*step < config.ctas_per_output); u++) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[u][i]); + } + } + } +#else + for (; input_offset < config.ctas_per_output; input_offset += step) { + index_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[i]); + } + } +#endif + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +template +static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) { + dim3 block = config.block(); + dim3 grid = config.grid(); + + auto stream = at::cuda::getCurrentCUDAStream(); + int shared_memory = config.shared_memory_size(); + + switch(config.output_vec_size) { + case 4: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + case 2: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + default: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +inline void launch_jitted_reduce_kernel( + std::mutex &jiterator_mutex, + std::array &fn_cache, + const at::cuda::jit::KernelDescriptor &desc, + int vt0, const ReduceConfig& config, const void *reduction) { + dim3 block = config.block(); + dim3 grid = config.grid(); + + int shared_memory = config.shared_memory_size(); + at::cuda::jit::NvrtcFunction* fn_ptr; + switch(config.output_vec_size) { + case 4: + fn_ptr = &fn_cache[0]; + break; + case 2: + fn_ptr = &fn_cache[1]; + break; + default: + fn_ptr = &fn_cache[2]; + } + if (!fn_ptr->function) { + int max_threads_codegen = + max_reduce_threads(desc.f_inputs_type) / config.output_vec_size; + auto code = at::cuda::jit::generate_reduction_code( + desc, vt0, true, false, config.output_vec_size, max_threads_codegen); + + *fn_ptr = at::cuda::jit::jit_pwise_function(code, "reduction_" + desc.name); + } + constexpr int kernel_args = 1; + const void* args[kernel_args]; + args[0] = reduction; + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, block, shared_memory); +} + + +class AccumulationBuffer { + public: + AccumulationBuffer() {} + + AccumulationBuffer(size_t acc_t_size, size_t out_t_size, char* out_ptr, int64_t size) { + out_ptr_ = (char*)out_ptr; + if (out_t_size >= acc_t_size) { + // reusing output buffer for accumulation. + acc_ptr_ = (char*)out_ptr; + numerator_ = 1; + denominator_ = 1; + } else { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer_ = allocator.allocate(size); + acc_ptr_ = (char*)buffer_.get(); + numerator_ = acc_t_size; + denominator_ = out_t_size; + reduce_fraction(numerator_, denominator_); + } + } + + char* get_acc_slice(char* out_ptr) { + if (acc_ptr_ == nullptr) { + return nullptr; + } + return acc_ptr_ + ((out_ptr - out_ptr_) * numerator_ / denominator_); + } + + private: + char* acc_ptr_ = nullptr; + char* out_ptr_ = nullptr; + size_t numerator_; + size_t denominator_; + at::DataPtr buffer_; +}; + +template +int get_output_vec_size(const TensorIterator &iter) { + int vec_size = 4; + auto update_vec_size = [&vec_size](uint64_t n) { + while(n % vec_size != 0) { + vec_size /= 2; + } + }; + + uint64_t base_address = reinterpret_cast(iter.data_ptr(iter.noutputs())) / sizeof(scalar_t); + update_vec_size(base_address); + + const int output_index = iter.num_reduce_dims(); + update_vec_size(iter.shape()[output_index]); + + int j = 0; + for(auto i : iter.strides(iter.noutputs())) { + if (j != output_index) { + update_vec_size(i / sizeof(scalar_t)); + } + j++; + } + return vec_size; +} + +template +ReduceConfig setReduceConfig(const TensorIterator& iter){ + // Start by assuming that each thread handles a single output and all + // the inputs for that output. + int64_t num_outputs = iter.num_output_elements(); + int64_t inputs_per_output = iter.numel() / num_outputs; + int input_index = iter.ntensors() - 1; + + auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output); + + int64_t dim0; + int64_t dim1; + int64_t fastest_moving_stride; + bool reduction_on_fastest_striding_dimension; + + if (iter.ndim() > 0) { + // Adjust block size to map block width to fastest changing dimension of input + // tensor. This grants the best possible memory accessing pattern, given that + // for non-contiguous tensor with space in between, we cannot have perfect + // memory coalescing. + reduction_on_fastest_striding_dimension = + (iter.num_reduce_dims() == iter.ndim()) || + (iter.strides(/*arg=*/input_index)[0] < + iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]); + // Notice that dim0 & dim1 does NOT guarantee any launch configuration here! + // dim0 & dim1 are more like the upper bound of the block dimension. The + // actual launch config and reduction scheme is determined by setting values + // to `config.input_mult` and `config.output_mult`. + // We try to max out dim1 so that we have enough threads per CTA to deliver + // performance for larger problem size. + if (reduction_on_fastest_striding_dimension) { + // Map block.x to the fastest reducing dimension. It implies: + // 1. block_x_reduce is required. + // 2. block.y now max out to num_outputs. + dim0 = inputs_per_output; + dim1 = num_outputs; + fastest_moving_stride = iter.strides(/*arg=*/input_index)[0]; + } else { + // Map block.x to the fastest non reducing dimension. It implies: + // 1. block_x_reduce is turned off. + // 2. block.y now max out to inputs_per_output. + dim0 = num_outputs; + dim1 = inputs_per_output; + fastest_moving_stride = iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]; + } + } else { + reduction_on_fastest_striding_dimension = true; + fastest_moving_stride = sizeof(scalar_t); + dim0 = 1; + dim1 = 1; + } + + // We do vectorization to gain better memory access, there are two cases which we call + // "vectorize along input" and "vectorize along output". Note that the "input/output" + // here does not mean we are vectorizing load/store instructions. We always only vectorize + // load instructions. + // + // Case 1: "vectorize along input" + // This case happens when we are reducing along fastest moving dimension. In such case, threads + // with the same threadIdx.y works on the same reduction cooperatively and will produce results + // for the same output. In such case, values in each loaded vector always correspond to the same output. + // + // Case 2: "vectorize along output" + // This case happens when the fastest moving dimension is not the dimension of reduction. In such case, + // threads with different threadIdx.x are independent and will produce results for different outputs. + // In such case, values in each loaded vector always correspond to different outputs. + if (fastest_moving_stride == sizeof(scalar_t)) { + if (reduction_on_fastest_striding_dimension && dim0 >= 128 && iter.num_reduce_dims() == 1) { + // Case 1: "vectorize along input" + // Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case, + // we should avoid vectorization. + config.vectorize_input = true; + dim0 /= input_vec_size; + } else if (!reduction_on_fastest_striding_dimension) { + // Case 2: "vectorize along output" + config.output_vec_size = get_output_vec_size(iter); + dim0 /= config.output_vec_size; + } + } + + // Adjust block_width and block_height + config.set_block_dimension(dim0, dim1); + + int block_width = config.block_width; + int block_height = config.block_height; + + if (iter.ndim() == 0 || reduction_on_fastest_striding_dimension) { + // Split the input across lanes if the input is contiguous in the reduced + // dimension. This will require reduction between threads using warp + // shuffle instructions and shared memory (if block_width > warpSize). + config.input_mult[0] = config.split_input(block_width); + } else { + // Otherwise split the output across lanes in a warp. + config.output_mult[0] = config.split_output(block_width); + } + + constexpr int min_values_per_thread = 16; + constexpr int max_values_per_thread = 256; + + const int warp_split_threshold = + std::min(block_height * 16, max_values_per_thread); + bool split_across_warps = config.values_per_thread() >= warp_split_threshold; + const int num_mp = + at::cuda::getCurrentDeviceProperties()->multiProcessorCount; +#ifdef USE_ROCM + bool force_splitting_output = iter.ndim() == 2 && + reduction_on_fastest_striding_dimension && + config.values_per_thread() < 1024 && num_mp < 100; + split_across_warps = !force_splitting_output && split_across_warps; +#endif + + if (split_across_warps) { + // Divide the input across warps in a thread-block, if that leaves at least + // 16 elements to be summed by each thread. This will require inter-warp + // reduction using shared memory. + config.input_mult[1] = config.split_input(block_height); + } else { + // Otherwise, each warp handles a separate output. + config.output_mult[1] = config.split_output(block_height); + } + + int max_threads_per_mp = + at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor; +#ifdef USE_ROCM + // If the grid consists of a single threadblock, do not change the max threads per + // MP value. This will increase the parallelism across the y dimension of the grid. + bool uses_a_single_block = config.grid().x == config.grid().y == config.grid().z == 1; + + if (!uses_a_single_block) { + // Control the number of threadblocks by adjusting the maximum number of + // threads per multi-processor. These numbers better reflect the maximum + // theoretical achievable threads per MP for the reduction operation. + if (iter.ndim() == 1 || iter.ndim() == 3) + max_threads_per_mp = 512; + else if (iter.ndim() == 2) + max_threads_per_mp = 256; + } +#endif + const int blocks_per_sm = max_threads_per_mp / config.num_threads; + const int target_grid_size = num_mp * blocks_per_sm; + int grid = config.grid().x; + if (config.input_mult[1] != 0 && config.values_per_thread() >= max_values_per_thread && grid <= target_grid_size) { + // Divide the input across thread-blocks if the amount of work per-thread + // is large enough and the size of the output is small enough. This will + // require a reduction using global memory. + // If we decide to split input across blocks, as long as we can get enough + // number of blocks (`target_grid_size`) to balance SM, we should still + // make the number of values per thread large for best performance. + int ctas_per_output1 = div_up(target_grid_size, grid); + int ctas_per_output2 = div_up(config.values_per_thread(), min_values_per_thread); + int ctas_per_output3 = div_up(config.values_per_thread(), max_values_per_thread); + // We want the minimum of ctas_per_output1 and ctas_per_output2, so that each thread can have + // a large number of values to deal with. But we don't want values_per_thread to be larger than + // max_values_per_thread + config.ctas_per_output = std::max(std::min(ctas_per_output1, ctas_per_output2), ctas_per_output3); +#ifdef USE_ROCM + // In cases where a number of threadblocks along the y direction of the grid + // is needed then make sure they are reduced to the number of MPs. For + // smaller sizes, use half the number of MPs. For smaller sizes than half + // the number of MPs use the original value unless the value is less than 16 + // blocks in which case it is more profitable to use just 1 block. + if (config.ctas_per_output > num_mp) + if (num_mp < 128) + config.ctas_per_output = + num_mp * (config.ctas_per_output > 512 ? 4 : 2); + else + config.ctas_per_output = num_mp; + else if (config.ctas_per_output > div_up(num_mp, 2)) + config.ctas_per_output = div_up(num_mp, 2); + else if (config.ctas_per_output < 16) + config.ctas_per_output = 1; + bool is_channel_last = iter.tensor_base(1).is_contiguous(at::MemoryFormat::ChannelsLast); + if (iter.ndim() == 3 && !reduction_on_fastest_striding_dimension && !is_channel_last) { + config.ctas_per_output = 4; + int vpt = config.values_per_thread(); + // Capping the number of values per thread to 2048 for now + // based on known use cases. + while (vpt >= 2048) { + config.ctas_per_output *= 2; + // Computes the new values per thread without side effects + vpt = config.mock_values_per_thread(config.ctas_per_output); + } + } +#endif + if (config.ctas_per_output > 1) { + config.input_mult[2] = config.split_input(config.ctas_per_output); + } + } + return config; +}; + +template +inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + using traits = function_traits; + using arg_t = typename traits::template arg<0>::type; + // at::Half/at::ComplexHalf overflows easily as it's range is very small. + // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same_v && + std::is_same_v) || + (std::is_same_v, scalar_t> && + std::is_same_v, out_scalar_t>); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same_v && + std::is_same_v); + static constexpr bool can_accumulate_in_output = + std::is_convertible_v && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(arg_t))); + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + gpu_reduce_kernel(sub_iter, ops, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + std::optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = std::nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = at::cuda::getCurrentCUDAStream(); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceOp( + ops, + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + launch_reduce_kernel::MAX_NUM_THREADS>(config, reduce); +} + +//TODO this is 100 lines of almost-copy-paste, because we have to have different template args for this function +//try unifying with gpu_reduce_kernel +template +inline void jitted_gpu_reduce_kernel(TensorIterator& iter, const std::string& func, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + //TODO - this will be different for more complicated reductions, but for now reductions using + //func_wrapper all have arg_t = opmath + using arg_t = at::opmath_type; + // at::Half/at::ComplexHalf overflows easily as it's range is very small. + // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same_v && + std::is_same_v ) || + (std::is_same_v, scalar_t> && + std::is_same_v, out_scalar_t>); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same_v && + std::is_same_v); + static constexpr bool can_accumulate_in_output = + std::is_convertible_v && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(out_scalar_t), //TODO + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(out_scalar_t))); //TODO + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + jitted_gpu_reduce_kernel(sub_iter, func, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + //TODO - for now we support a single input, we may be able to relax this constraint + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + std::optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = std::nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = at::cuda::getCurrentCUDAStream(); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceJitOp( + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + constexpr int nInputs = 1; + constexpr int nOutputs = 1; + static auto desc = at::cuda::jit::make_kernel_descriptor< + out_scalar_t, scalar_t>(name, func, nInputs, nOutputs); + + static std::mutex jiterator_mutex; + static std::vector> fn_cache(c10::cuda::device_count()); + auto &cache = fn_cache[iter.device().index()]; + + launch_jitted_reduce_kernel( + jiterator_mutex, cache, desc, vt0, config, &reduce); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ReduceOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ReduceOps.h new file mode 100644 index 0000000000000000000000000000000000000000..16394d6d931a4468b129ff16eef2094ced4ece93 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ReduceOps.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) + +namespace at { +struct TensorIterator; +} + +namespace c10 { +class Scalar; +} + +namespace at::native { + +void norm_launch_kernel(TensorIterator &iter, double val); +void min_launch_kernel(TensorIterator &iter); +void max_launch_kernel(TensorIterator &iter); +void aminmax_launch_kernel(TensorIterator &iter); +void min_all_launch_kernel(TensorIterator &iter); +void max_all_launch_kernel(TensorIterator &iter); +void aminmax_allreduce_launch_kernel(TensorIterator &iter); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Resize.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Resize.h new file mode 100644 index 0000000000000000000000000000000000000000..8c0155f888a6943fb4267d639fd57baf5887c0a7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Resize.h @@ -0,0 +1,58 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include + +namespace at::native { + +TORCH_CUDA_CPP_API void resize_bytes_cuda(StorageImpl* storage, size_t size_bytes); + +static inline void maybe_resize_storage_cuda(TensorImpl* self, size_t new_size_bytes) { + // It does not make sense to try to resize a storage + // to hold 0 elements, and this can break + // if storage_offset is positive but + // new_size is 0, so just bail in that case + // (same comment is in Resize.h) + if (self->numel() == 0) { + return; + } + + const Storage &storage = self->unsafe_storage(); + TORCH_CHECK(storage, "Tensor: invalid null storage"); + if (new_size_bytes > storage.nbytes()) { + resize_bytes_cuda(storage.unsafeGetStorageImpl(), new_size_bytes); + } +} + +inline TensorImpl* resize_impl_cuda_( + TensorImpl* self, + IntArrayRef size, + at::OptionalIntArrayRef stride) { + if (self->sizes() == size && (!stride || self->strides() == stride)) { + return self; + } + const auto itemsize = self->dtype().itemsize(); + const auto storage_offset = self->storage_offset(); + size_t storage_size = 1; + if (stride) { + self->set_sizes_and_strides(size, *stride); + storage_size = at::detail::computeStorageNbytes( + size, *stride, itemsize, storage_offset); + } else { + self->set_sizes_contiguous(size); + storage_size = at::detail::computeStorageNbytesContiguous( + size, itemsize, storage_offset); + } + maybe_resize_storage_cuda(self, storage_size); + + return self; +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/RowwiseScaledMM.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/RowwiseScaledMM.h new file mode 100644 index 0000000000000000000000000000000000000000..613bb0705dfd668e5eeb3a94a52fe4f0142551a6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/RowwiseScaledMM.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::cuda::detail { +TORCH_API void f8f8bf16_rowwise( + at::Tensor XQ, // FP8 + at::Tensor WQ, // FP8 + at::Tensor x_scale, // FP32 + at::Tensor w_scale, // FP32 + std::optional bias, // BF16 + bool use_fast_accum, + at::Tensor& out); +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScaledGroupMM.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScaledGroupMM.h new file mode 100644 index 0000000000000000000000000000000000000000..d246ce955364ba361cac7f48d64ff94bce23c43e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScaledGroupMM.h @@ -0,0 +1,20 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::cuda::detail { +TORCH_API void f8f8bf16_grouped_mm( + at::Tensor mat_a, // FP8 + at::Tensor mat_b, // FP8 + at::Tensor scale_a, // FP32 + at::Tensor scale_b, // FP32 + std::optional offs, + std::optional bias, // BF16 + bool use_fast_accum, + at::Tensor& out); +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanKernels.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanKernels.h new file mode 100644 index 0000000000000000000000000000000000000000..9584f4710ea0657269da74889322589df83d54c7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanKernels.h @@ -0,0 +1,23 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { +class TensorBase; + +namespace native { + +// NOTE: these functions require output tensors to be contiguous +void launch_cummax_cuda_kernel(const TensorBase& self, const TensorBase& values, + const TensorBase& indices, int64_t dim); +void launch_cummin_cuda_kernel(const TensorBase& self, const TensorBase& values, + const TensorBase& indices, int64_t dim); +void launch_logcumsumexp_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); +void launch_cumsum_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); +void launch_cumprod_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim); + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4de7c7a0f7ad7f1cf2800505c35de5063cf7cefa --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh @@ -0,0 +1,485 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +#include +#include +#include + +namespace at::native { + +template +constexpr inline integer ceil_div(integer n, integer m) { + return (n + m - 1) / m; +} + +template +constexpr inline integer get_log_num_threads_x_inner_scan(integer num_rows, integer row_size) { + integer log_num_threads_x = 0; + integer log_num_threads_y = 0; + while (((integer)1 << log_num_threads_x) < row_size) { + ++log_num_threads_x; + } + while (((integer)1 << log_num_threads_y) < num_rows) { + ++log_num_threads_y; + } + // we want to keep the ratio between the x-threads and y-threads about the same as + // the ratio between the row_size and num_rows, but the total number of threads in + // a block should be about 512 + integer diff = log_num_threads_x - log_num_threads_y; + // 9 is from log2(512) + log_num_threads_x = ((integer)9 + diff) / (integer)2; + // I found that in having larger log_num_threads_x can give significant speed up in some cases, + // but detrimental in another case, so just keep the lower bound to be log2(16) == 4 to make it + // similar to the previous implementation + // Keeping the upper bound to be log2(512) == 9 as the maximum number of threads in a block. + log_num_threads_x = std::min(std::max((integer)4, log_num_threads_x), (integer)9); + return log_num_threads_x; +} + +template +__device__ void binary_op_update(const scalar_t lhs, scalar_t& rhs, const idx_t lhs_idx, idx_t& rhs_idx, BinaryOperation binary_op) { + if(!at::_isnan(rhs) && (at::_isnan(lhs) || !binary_op(rhs, lhs))) { + rhs = lhs; + rhs_idx = lhs_idx; + } +} +/* Perform an inclusive scan along the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + */ +template +__global__ void tensor_kernel_scan_innermost_dim_with_indices(const scalar_t *self_, scalar_t *values_, int64_t *indices_, + int num_rows, int row_size, + const uint32_t num_threads, const uint32_t log_num_threads_x, + scalar_t init, BinaryFunction binary_op) { + // dynamic memory allocation for vbuf and ibuf + alignas(sizeof(double)) extern __shared__ char buf[]; + scalar_t* vbuf = reinterpret_cast(buf); // the size is num_threads * 2 + int64_t* ibuf = reinterpret_cast(vbuf + num_threads * 2); + const uint32_t num_threads_x = 1 << log_num_threads_x; + scalar_t* row_buf = vbuf + 2 * num_threads_x * threadIdx.y; + int64_t* row_idx_buf = ibuf + 2 * num_threads_x * threadIdx.y; + + for (int block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + int row = block_row + threadIdx.y; + const scalar_t *row_self = self_ + row * row_size; + scalar_t *row_values = values_ + row * row_size; + int64_t *row_indices = indices_ + row * row_size; + scalar_t block_total = init; + int64_t block_idx_final = 0; + const bool row_exists = row < num_rows; + // Perform scan on one block at a time, keeping track of the total value of + // all blocks processed so far. + for (int block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + // Load data into shared memory (two values per thread). + int col1 = block_col + threadIdx.x; + int col2 = block_col + num_threads_x + threadIdx.x; + if (row_exists) { + if (col1 < row_size) { + row_buf[threadIdx.x] = c10::load(&row_self[col1]); + row_idx_buf[threadIdx.x] = col1; + } else { + row_buf[threadIdx.x] = init; + // No need to set the index here as the value in init will never be selected + } + + if (col2 < row_size) { + row_buf[num_threads_x + threadIdx.x] = c10::load(&row_self[col2]); + row_idx_buf[num_threads_x + threadIdx.x] = col2; + } else { + row_buf[num_threads_x + threadIdx.x] = init; + // No need to set the index here as the value in init will never be selected + } + + // Add the total value of all previous blocks to the first value of this block. + if (threadIdx.x == 0) { + binary_op_update(block_total, row_buf[0], block_idx_final, row_idx_buf[0], binary_op); + } + } + __syncthreads(); + + // Parallel reduction with Sklansky method. The diagram can be seen on this paper: + // https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + for (uint32_t s = 1; s <= num_threads_x; s <<= 1) { + if (row_exists) { + uint32_t a = (threadIdx.x / s) * (2 * s) + s; + uint32_t ti = a + (threadIdx.x % s); + uint32_t si = a - 1; + binary_op_update(row_buf[si], row_buf[ti], row_idx_buf[si], row_idx_buf[ti], binary_op); + } + __syncthreads(); + } + + // Write back to output. + if (row_exists) { + if (col1 < row_size){ + row_values[col1] = row_buf[threadIdx.x]; + row_indices[col1] = row_idx_buf[threadIdx.x]; + } + if (col2 < row_size) { + row_values[col2] = row_buf[num_threads_x + threadIdx.x]; + row_indices[col2] = row_idx_buf[num_threads_x + threadIdx.x]; + } + } + block_total = row_buf[2 * num_threads_x - 1]; + block_idx_final = row_idx_buf[2 * num_threads_x - 1]; + __syncthreads(); + } + } +} + +/* Perform an inclusive scan along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to compute the variance; + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. + */ +template +__global__ void tensor_kernel_scan_outer_dim_with_indices(const scalar_t *self_, scalar_t *values_, int64_t *indices_, + const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, scalar_t init, BinaryFunction binary_op) { + for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + const scalar_t *self = self_ + orow * row_size * num_irows + irow; + scalar_t *values = values_ + orow * row_size * num_irows + irow; + int64_t *indices = indices_ + orow * row_size * num_irows + irow; + scalar_t out = init; + int64_t out_idx = 0; + + for (auto col = decltype(row_size){0}; col < row_size; ++col) { + const auto val = c10::load(self); + if(at::_isnan(val) || (!at::_isnan(out) && binary_op(val, out))) { + out = val; + out_idx = col; + } + *values = out; + *indices = out_idx; + self += num_irows; + values += num_irows; + indices += num_irows; + } + } + } +} + +inline void check_fits_in_unsigned(int64_t val, const char* name) { + constexpr auto umax = std::numeric_limits::max(); + TORCH_CHECK( + val >= 0 && val <= umax, name, " must fit in a 32-bit uint32_t value"); +} + + +template +__host__ void scan_outer_dim_with_indices( + const TensorBase& self, const TensorBase& values, const TensorBase& indices, + int dim, scalar_t init, BinaryFunction binary_op) { + int64_t row_size = self.size(dim); + auto sizes = self.sizes(); + + // Treat all outer dimensions (i.e. dim_ < dim) as one. + const int64_t num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + dim); + + // Treat all inner dimensions (i.e. dim > dimension) as one. + const int64_t num_irows = c10::multiply_integers(sizes.begin() + dim + 1, sizes.end()); + //for performance reasons, cuda kernels use uint32_t for loops over irows, orows and row, + //make sure that input is not bigger than supported by uint32_t + check_fits_in_unsigned(num_irows, "num_irows"); + check_fits_in_unsigned(num_orows, "num_orows"); + check_fits_in_unsigned(row_size, "row_size"); + + + dim3 threads(std::min(512, int(num_irows))); + int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x}))); + tensor_kernel_scan_outer_dim_with_indices<<>>( + self.const_data_ptr(), values.mutable_data_ptr(), indices.mutable_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +__host__ void scan_innermost_dim_with_indices( + const TensorBase& self, const TensorBase& values, const TensorBase& indices, + scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + // Treat all outer dimensions as a single dimension. + int row_size = self.size(ndim - 1); + int num_rows = self.numel() / row_size; + + // assuming max_num_threads per block is 512 + const uint32_t num_threads = 512; + const uint32_t log_num_threads_x = get_log_num_threads_x_inner_scan(num_rows, row_size); + const uint32_t num_threads_x = (1 << log_num_threads_x); + const uint32_t num_threads_y = num_threads / num_threads_x; + dim3 threads(num_threads_x, num_threads_y); + dim3 grid(std::min(at::cuda::getCurrentDeviceProperties()->maxGridSize[0], ceil_div(num_rows, int(threads.y)))); + + const uint32_t mem_size = 2 * num_threads * (sizeof(scalar_t) + sizeof(int64_t)); + tensor_kernel_scan_innermost_dim_with_indices<<>>( + self.const_data_ptr(), values.mutable_data_ptr(), indices.mutable_data_ptr(), + num_rows, row_size, num_threads, log_num_threads_x, init, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, const TensorBase& indices, //int64_t dim) { + int64_t dim, scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + auto self_ = self.expect_contiguous(); + TORCH_INTERNAL_ASSERT(values.is_contiguous() && indices.is_contiguous()); + if (dim == ndim - 1) { + scan_innermost_dim_with_indices(*self_, values, indices, init, binary_op); + } else { + scan_outer_dim_with_indices(*self_, values, indices, dim, init, binary_op); + } +} + +// TODO: The implementation of `tensor_kernel_scan_outer_dim` and +// `tensor_kernel_scan_innermost_dim` is similar to +// `tensor_kernel_scan_outer_dim_with_indices` +// `tensor_kernel_scan_outer_dim_with_indices` and should be refactored to +// remove the duplication. + +/* Perform an inclusive scan along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to scan; + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. + */ +template +__global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, const scalar_t *src_, + const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, + const scalar_t init, BinaryOp binary_op) +{ + for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + const scalar_t *src = src_ + static_cast(orow) * row_size * num_irows + irow; + scalar_t *tgt = tgt_ + (index_t) orow * row_size * num_irows + irow; + scalar_t acc = init; + + for (uint32_t col = 0; col < row_size; ++col) { + acc = binary_op(acc, c10::load(src)); + *tgt = acc; + + src += num_irows; + tgt += num_irows; + } + } + } +} + +/* Perform an inclusive scan along the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + */ +template +__device__ void tensor_kernel_scan_innermost_dim_impl(T* row_buf, T *tgt_, const T *src_, + const uint32_t num_rows, const uint32_t row_size, + const uint32_t log_num_threads_x, + T init, BinaryFunction binary_op){ + const index_t num_threads_x = 1 << log_num_threads_x; + for (index_t block_row = blockIdx.x * (index_t) blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + index_t row = block_row + (index_t) threadIdx.y; + T block_total = init; + + const T *row_src = src_ + row * row_size; + T *row_tgt = tgt_ + row * row_size; + const bool row_exists = row < num_rows; + + // Perform scan on one block at a time, keeping track of the total value of + // all blocks processed so far. + for (index_t block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + // Load data into shared memory (two values per thread). + index_t col1 = block_col + (index_t) threadIdx.x; + index_t col2 = block_col + num_threads_x + (index_t) threadIdx.x; + if (row_exists) { + if (col1 < row_size) { + row_buf[threadIdx.x] = row_src[col1]; + } else { + row_buf[threadIdx.x] = init; + } + + if (col2 < row_size) { + row_buf[num_threads_x + threadIdx.x] = row_src[col2]; + } else { + row_buf[num_threads_x + threadIdx.x] = init; + } + + // Add the total value of all previous blocks to the first value of this block. + if (threadIdx.x == 0) { + row_buf[0] = binary_op(row_buf[0], block_total); + } + } + __syncthreads(); + + // Parallel reduction with Sklansky method. The diagram can be seen on this paper: + // https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + for (int m = 0; m <= log_num_threads_x; ++m) { + if (row_exists) { + index_t s = 1 << m; // s = 2 ^ m + auto a = static_cast((threadIdx.x >> m) << (m + 1)) | s; // a = (threadIdx.x / s) * (2 * s) + s + index_t ti = a + (threadIdx.x % s); + index_t si = a - 1; + row_buf[ti] = binary_op(row_buf[ti], row_buf[si]); + } + __syncthreads(); + } + + // Write back to output. + if (row_exists) { + if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x]; + if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x]; + } + block_total = row_buf[2 * num_threads_x - 1]; + __syncthreads(); + } + } +} + +template < + typename T, + class BinaryFunction> +__global__ void tensor_kernel_scan_innermost_dim( + T* tgt_, + const T* src_, + const uint32_t num_rows, + const uint32_t row_size, + const uint32_t log_num_threads_x, + T init, + BinaryFunction binary_op) { + alignas(sizeof(double)) extern __shared__ char sbuf[]; + T* sbuf2 = reinterpret_cast(sbuf); + const uint32_t num_threads_x = 1 << log_num_threads_x; + T* row_buf = reinterpret_cast(sbuf2 + num_threads_x * 2 * threadIdx.y); + if (num_rows * (size_t) row_size <= UINT_MAX) { + tensor_kernel_scan_innermost_dim_impl( + row_buf, tgt_, src_, num_rows, row_size, log_num_threads_x, init, binary_op); + } else { + tensor_kernel_scan_innermost_dim_impl( + row_buf, tgt_, src_, num_rows, row_size, log_num_threads_x, init, binary_op); + } +} + + +template +__host__ void scan_outer_dim(const TensorBase& self, const TensorBase& result, + int dim, scalar_t init, BinaryFunction binary_op) { + const int64_t row_size = self.size(dim); + auto sizes = self.sizes(); + + // Treat all outer dimensions (i.e. dim_ < dim) as one. + const int64_t num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + dim); + + // Treat all inner dimensions (i.e. dim > dimension) as one. + const int64_t num_irows = c10::multiply_integers(sizes.begin() + dim + 1, sizes.end()); + + dim3 threads(std::min(512, int(num_irows))); + int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x}))); + + check_fits_in_unsigned(num_irows, "num_irows"); + check_fits_in_unsigned(num_orows, "num_orows"); + check_fits_in_unsigned(row_size, "row_size"); + if (static_cast(num_irows) * num_orows * row_size <= UINT_MAX) { + tensor_kernel_scan_outer_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + } else { + tensor_kernel_scan_outer_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_innermost_dim(const TensorBase& self, const TensorBase& result, + scalar_t init, BinaryFunction binary_op) { + int64_t ndim = self.dim(); + // Treat all outer dimensions as a single dimension. + int64_t row_size = self.size(ndim - 1); + int64_t num_rows = self.numel() / row_size; + + // assuming max_num_threads per block is 512 + const uint32_t num_threads = 512; + const uint32_t log_num_threads_x = get_log_num_threads_x_inner_scan(num_rows, row_size); + const uint32_t num_threads_x = (1 << log_num_threads_x); + const uint32_t num_threads_y = num_threads / num_threads_x; + dim3 threads(num_threads_x, num_threads_y); + int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; + dim3 grid(std::min(maxGridDim, ceil_div(num_rows, int64_t{threads.y}))); + + check_fits_in_unsigned(num_rows, "Number of rows (self.numel()/self.size(self.dim()-1))"); + check_fits_in_unsigned(row_size, "row_size"); + + tensor_kernel_scan_innermost_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_rows, row_size, log_num_threads_x, init, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_dim(const TensorBase& self, const TensorBase& result, + int64_t dim, scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + auto self_ = self.expect_contiguous(); + TORCH_INTERNAL_ASSERT(result.is_contiguous()); + + if (self.numel() == self.size(dim)) { + if constexpr (std::is_same_v>) { + if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms()) && (self.is_floating_point() || self.is_complex())) { +#if defined(CUDA_VERSION) || defined(USE_ROCM) + cuda::cub::inclusive_deterministic_scan(self_->const_data_ptr(), result.mutable_data_ptr(), binary_op, self.numel()); +#else + globalContext().alertNotDeterministic("cumsum_cuda_kernel"); + cuda::cub::inclusive_scan(self_->const_data_ptr(), result.mutable_data_ptr(), binary_op, self.numel()); +#endif + } else { + cuda::cub::inclusive_scan(self_->const_data_ptr(), result.mutable_data_ptr(), binary_op, self.numel()); + } + } else { + cuda::cub::inclusive_scan(self_->const_data_ptr(), result.mutable_data_ptr(), binary_op, self.numel()); + } + } else if (dim == ndim - 1) { + scan_innermost_dim(*self_, result, init, binary_op); + } else { + scan_outer_dim(*self_, result, dim, init, binary_op); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Sort.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Sort.h new file mode 100644 index 0000000000000000000000000000000000000000..bcee7d13a3ab300226ab07737fbe9f6aa6773d8b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Sort.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + + +namespace at::native { + +inline bool should_use_small_sort(const TensorBase &self, int64_t dim) { + return self.size(dim) <= 4096; +} + +void sortKeyValueInplace( + const TensorBase &key, const TensorBase &value, int64_t dim, + bool descending, bool stable=false); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortStable.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortStable.h new file mode 100644 index 0000000000000000000000000000000000000000..1c95d04432a28e8d9c15d30e00f93a5b7a1fb4ef --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortStable.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +// Stable-sort self into values, and set indices to the +// inverse-permutation from values back to self. +// Output tensors must be pre-allocated and contiguous. +void launch_stable_sort_kernel( + const TensorBase& self, + int64_t dim, + bool descending, + const TensorBase& values, + const TensorBase& indices); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortUtils.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f18c3c56fbddda53efba5c7b8c2065bf3c67229f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortUtils.cuh @@ -0,0 +1,348 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#include +#include +#include +#include +#include +#include +#include + +#define HAS_WARP_MERGE_SORT() (CUDA_VERSION >= 110600) + + +namespace at::native { + +template +__device__ inline void swapVars(T& t1, T& t2) { + T tmp = t1; + t1 = t2; + t2 = tmp; +} + +template +__device__ inline void bitonicSwap(K& kA, V& vA, bool& validA, + K& kB, V& vB, bool& validB, + bool dir, + const Comparator& comp) { + // Invalid entries always sort to the end + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(vA, vB); + swapVars(validA, validB); + } +}; + +template +__device__ inline void bitonicSort(K *keys, + V *values, + bool *valid, + const Comparator& comp) { +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int size = 2; size < Power2SortSize; size *= 2) { + bool flag = ((threadIdx.x & (size / 2)) != 0); + +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int stride = size / 2; stride > 0; stride /= 2) { + + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwap( + keys[pos], values[pos], valid[pos], + keys[pos + stride], values[pos + stride], valid[pos + stride], + flag, comp); + } + } + +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { + + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwap( + keys[pos], values[pos], valid[pos], + keys[pos + stride], values[pos + stride], valid[pos + stride], + false, comp); + } + + __syncthreads(); + +} + +// at::cuda::detail::TensorInfo version +// Sorts (key, value) pairs (in different tensors) in-place; i.e., +// modifies the input `keys` and `values` +template +C10_LAUNCH_BOUNDS_1(block_dim_x * max_block_dim_y) +__global__ void +bitonicSortKVInPlace(at::cuda::detail::TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::cuda::detail::TensorInfo values, + IndexType valueSliceStride, + Comparator comp) { + // Find the slice of the tensor that we are sorting + // NOTE: blockDim.y may be less max_block_dim_y + const IndexType blockIndex = getLinearBlockId(); + const IndexType linearIndex = blockIndex * blockDim.y + threadIdx.y; + + // If the entire block is out of bounds exit early + if (blockIndex * blockDim.y >= keySlices) { + return; + } + // It's also possible for some rows of a block to be out of bounds + // but all thread need to run for __syncthreads to work. + const bool row_valid = linearIndex < keySlices; + + constexpr int items_per_thread = 2; + constexpr int Power2SortSize = block_dim_x * items_per_thread; + + // Storage for max_block_dim_y sorts performed in parallel + __shared__ K blockSharedKeys[max_block_dim_y][Power2SortSize]; + __shared__ V blockSharedValues[max_block_dim_y][Power2SortSize]; + __shared__ bool blockSharedValid[max_block_dim_y][Power2SortSize]; + + auto sharedKeys = blockSharedKeys[threadIdx.y]; + auto sharedValues = blockSharedValues[threadIdx.y]; + auto sharedValid = blockSharedValid[threadIdx.y]; + + const IndexType keyStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, values); + + // Load 2 values per thread into the shared workspace + #pragma unroll + for (int k = 0; k < items_per_thread; ++k) { + auto idx = threadIdx.x + k * blockDim.x; + bool valid = row_valid && idx < keySliceSize; + + sharedKeys[idx] = valid ? + keys.data[idx * keySliceStride + keyStartOffset] : K{}; + sharedValues[idx] = valid ? + values.data[idx * valueSliceStride + valueStartOffset] : V{}; + sharedValid[idx] = valid; + } + + // Sort! + bitonicSort( + sharedKeys, sharedValues, sharedValid, comp); + + if (!row_valid) { + return; + } + + // Store outputs + #pragma unroll + for (int k = 0; k < items_per_thread; ++k) { + auto idx = threadIdx.x + k * blockDim.x; + if (idx < keySliceSize) { + keys.data[idx * keySliceStride + keyStartOffset] = sharedKeys[idx]; + values.data[idx * valueSliceStride + valueStartOffset] = sharedValues[idx]; + } + } +} + +#if HAS_WARP_MERGE_SORT() + +template +C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE * max_block_dim_y) +__global__ void +warpMergeSortKVInPlace( + at::cuda::detail::TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::cuda::detail::TensorInfo values, + IndexType valueSliceStride, + Comparator comp, + K invalid_key) { + // Find the slice of the tensor that we are sorting + // NOTE: blockDim.y may be less max_block_dim_y + const IndexType blockIndex = getLinearBlockId(); + const IndexType linearIndex = blockIndex * blockDim.y + threadIdx.y; + + // If this row is out of bounds exit early + if (linearIndex >= keySlices) { + return; + } + + const IndexType keyStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, values); + + K *keys_slice = &keys.data[keyStartOffset]; + V *values_slice = &values.data[valueStartOffset]; + + StridedRandomAccessor keys_iter(keys_slice, keySliceStride); + StridedRandomAccessor values_iter(values_slice, valueSliceStride); + + namespace cub = ROCM_HIPCUB(at_cuda_detail::cub); + + CUDA_KERNEL_ASSERT(blockDim.x == C10_WARP_SIZE); + CUDA_KERNEL_ASSERT(blockDim.y <= max_block_dim_y); + constexpr int items_per_thread = sort_size / C10_WARP_SIZE; + static_assert( + items_per_thread * C10_WARP_SIZE == sort_size, + "sort_size must be a multiple of C10_WARP_SIZE"); + + + using LoadKeys = cub::WarpLoad; + using LoadValues = cub::WarpLoad; + using Sort = cub::WarpMergeSort; + using StoreKeys = cub::WarpStore; + using StoreValues = cub::WarpStore; + + __shared__ union { + typename LoadKeys::TempStorage load_keys; + typename LoadValues::TempStorage load_values; + typename Sort::TempStorage sort; + typename StoreKeys::TempStorage store_keys; + typename StoreValues::TempStorage store_values; + } tmp_storage[max_block_dim_y]; + + auto& warp_storage = tmp_storage[threadIdx.y]; + + // Load inputs + K local_keys[items_per_thread]; + V local_values[items_per_thread]; + + const auto invalid_value = V{}; + LoadKeys(warp_storage.load_keys).Load(keys_iter, local_keys, keySliceSize, invalid_key); + WARP_SYNC(); + LoadValues(warp_storage.load_values).Load(values_iter, local_values, keySliceSize, invalid_value); + WARP_SYNC(); + + // Sort! We use stable sort to ensure that invalid values are never + // sorted before valid values. In testing it performed the same as + // .Sort, so there is no down-side. + Sort(warp_storage.sort).StableSort( + local_keys, local_values, comp, keySliceSize, invalid_key); + WARP_SYNC(); + + // Store outputs + StoreKeys(warp_storage.store_keys).Store(keys_iter, local_keys, keySliceSize); + WARP_SYNC(); + StoreValues(warp_storage.store_values).Store(values_iter, local_values, keySliceSize); +} + +#endif // HAS_WARP_MERGE_SORT() + +template +C10_LAUNCH_BOUNDS_1(block_size) +__global__ void +radixSortKVInPlace(at::cuda::detail::TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::cuda::detail::TensorInfo values, + IndexType valueSliceStride, + bool descending) { + static_assert(block_size > 0, ""); + + // Find the slice of the tensor that we are sorting + const IndexType linearIndex = getLinearBlockId(); + // Tiling the slices could have us be out of bounds, if there are a + // lot of slices to sort + if (linearIndex >= keySlices) { + return; + } + + const IndexType keyStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, values); + + K *keys_slice = &keys.data[keyStartOffset]; + V *values_slice = &values.data[valueStartOffset]; + + StridedRandomAccessor keys_iter(keys_slice, keySliceStride); + StridedRandomAccessor values_iter(values_slice, valueSliceStride); + + namespace cub = ROCM_HIPCUB(at_cuda_detail::cub); + + using key_t = typename at::cuda::cub::detail::cuda_type::type; + using LoadKeys = cub::BlockLoad; + using LoadValues = cub::BlockLoad; + using Sort = cub::BlockRadixSort; + using StoreKeys = cub::BlockStore; + using StoreValues = cub::BlockStore; + + __shared__ union { + typename LoadKeys::TempStorage load_keys; + typename LoadValues::TempStorage load_values; + typename Sort::TempStorage sort; + typename StoreKeys::TempStorage store_keys; + typename StoreValues::TempStorage store_values; + } tmp_storage; + + // cub's Block operations operate on a fixed number of items, but the + // actual slice we are sorting might be smaller. So, we need to make + // up the difference with keys that will always sort higher. + const K invalid_key = [descending] { + using radix_t = typename cub::Traits::UnsignedBits; + union { + K key; + radix_t radix; + } tmp; + tmp.radix = descending ? + cub::Traits::LOWEST_KEY : + cub::Traits::MAX_KEY; + return tmp.key; + }(); + const V invalid_value = static_cast(0); + + // Load inputs + K local_keys[items_per_thread]; + V local_values[items_per_thread]; + + LoadKeys(tmp_storage.load_keys).Load(keys_iter, local_keys, keySliceSize, invalid_key); + __syncthreads(); + LoadValues(tmp_storage.load_values).Load(values_iter, local_values, keySliceSize, invalid_value); + __syncthreads(); + + // Sort! + if (descending) { + Sort(tmp_storage.sort).SortDescending( + reinterpret_cast(local_keys), + local_values); + } else { + Sort(tmp_storage.sort).Sort( + reinterpret_cast(local_keys), + local_values); + } + __syncthreads(); + + // Store outputs + StoreKeys(tmp_storage.store_keys).Store(keys_iter, local_keys, keySliceSize); + __syncthreads(); + StoreValues(tmp_storage.store_values).Store(values_iter, local_values, keySliceSize); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Sorting.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Sorting.h new file mode 100644 index 0000000000000000000000000000000000000000..32697c85aa281e4aa1a4b305ef0a5551dacab1e7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Sorting.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +void launch_kthvalue_kernel( + const TensorBase &values, const TensorBase &indices, + const TensorBase &self, int64_t dim, int64_t k); +void launch_median_kernel( + const TensorBase &vals, const TensorBase &inds, + const TensorBase &in, int64_t dim, bool ignore_nan); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortingCommon.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortingCommon.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c80afbfdef4593be97846b2d35da1bd622ade0a6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortingCommon.cuh @@ -0,0 +1,198 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// Is this questionable namespace pollution? +#if defined(USE_ROCM) +constexpr int MAX_BLOCK_SIZE = 256; + +#else +constexpr int MAX_BLOCK_SIZE = 1024; +#endif + +// Maximum size per grid dimension that we assume (compute capability >= 2.0) +constexpr int64_t MAX_GRID_SIZE = 65535LL; + +inline bool getGridFromTiles(int64_t gridTiles, dim3& grid) { + if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) { + return false; + } + + int64_t gridX = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + int64_t gridY = 1; + int64_t gridZ = 1; + + if (gridTiles > MAX_GRID_SIZE) { + gridTiles = ceil_div(gridTiles, MAX_GRID_SIZE); + gridY = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + + if (gridTiles > MAX_GRID_SIZE) { + gridTiles = ceil_div(gridTiles, MAX_GRID_SIZE); + gridZ = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + } + } + + grid = dim3(gridX, gridY, gridZ); + return true; +} + +template +struct GTOp { + __device__ bool operator()(const scalar_t& lhs, const scalar_t& rhs) const { + return (handleNaN && at::_isnan(lhs) && !at::_isnan(rhs)) || + (static_cast(lhs) > static_cast(rhs)); + } +}; + +template +struct LTOp { + __device__ bool operator()(const scalar_t& lhs, const scalar_t& rhs) const { + return (handleNaN && at::_isnan(rhs) && !at::_isnan(lhs)) || + (static_cast(lhs) < static_cast(rhs)); + } +}; + +template +__device__ __forceinline__ index_t getLinearBlockId() { + return blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + + blockIdx.x; +} + +// For slice sorting in Thrust; extracts a slice index from a linear +// index and uses that for comparison +struct SliceComp { + SliceComp(int64_t size) : sliceSize(size) {} + + __device__ bool operator()(const int64_t& a, const int64_t& b) const { + // Since the slices are guaranteed to be innermost, + // the segment is just via int64_t division + int64_t segA = a / sliceSize; + int64_t segB = b / sliceSize; + return segA < segB; + } + + const int64_t sliceSize; +}; + +// For sorting in Thurst; extracts a within-slice index from a linear index +struct GlobalIndexToPerSliceIndex { + GlobalIndexToPerSliceIndex(int64_t size) : sliceSize(size) {} + + __device__ inline void operator()(int64_t& v) const { + v = v % sliceSize; + } + + const int64_t sliceSize; +}; + +// Returns 2^(ceil(lg(n)) from Stanford bit twiddling hacks +inline uint64_t nextHighestPowerOf2(uint64_t n) { + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; +#ifndef _MSC_VER + n |= n >> 32; +#endif + n++; + + return n; +} + + +// WARNING: This function assumes input tensors are contiguous +template +void run_launcher( + const TensorBase &values, + const TensorBase &indices, + const TensorBase &self, + int64_t dim, + Launcher l) { + auto self_info = cuda::detail::getTensorInfo(self); + auto values_info = cuda::detail::getTensorInfo(values); + auto indices_info = cuda::detail::getTensorInfo(indices); + + int64_t slice_size = self.size(dim); + /* We use these structures solely to find the offset to */ + /* each slice we are operating on */ + self_info.reduceDim(dim); + values_info.reduceDim(dim); + indices_info.reduceDim(dim); + + /* Collapse all other dims */ + int collapse_self_dim = self_info.collapseDims(dim); + int collapse_values_dim = values_info.collapseDims(dim); + int collapse_indices_dim = indices_info.collapseDims(dim); + + int64_t num_slices = 1; + for (int i = 0; i < self_info.dims; ++i) { + num_slices *= self_info.sizes[i]; + } + + /* This is used as a template parameter to calculate indices. */ + /* We only specialize it if all collapsed dim sizes are the */ + /* same; otherwise, we use -1 which is the specialization */ + /* parameter for arbitrary dimensions */ + int all_dims = self_info.dims; + if (values_info.dims != all_dims || indices_info.dims != all_dims) { + all_dims = -1; + } + + if (all_dims == 1) { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } else if (all_dims == 2) { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } else if (all_dims == 3) { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } else { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortingRadixSelect.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortingRadixSelect.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1af2600814ffcafe11e24734b2c6a3d4b107aeec --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortingRadixSelect.cuh @@ -0,0 +1,432 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include +#include + +namespace at::native { + +template +struct TopKTypeConfig {}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + // Converts a float to an integer representation with the same + // sorting; i.e., for floats f1, f2: + // if f1 < f2 then convert(f1) < convert(f2) + // We use this to enable radix selection of floating-point values. + // This also gives a relative order for NaNs, but that's ok, as they + // will all be adjacent + // neg inf: signbit=1 exp=ff fraction=0 --> radix = 0 00 ff.. + // pos inf: signbit=0 exp=ff fraction=0 --> radix = 1 ff 00.. + // pos nan: signbit=0 exp=ff fraction>0 --> radix = 1 ff x>0 + // neg nan: signbit=1 exp=ff fraction>0 --> radix = 0 00 x +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(uint8_t v) { + return v; + } + + static inline __device__ uint8_t deconvert(RadixType v) { + return v; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int8_t v) { + return 128u + v; + } + + static inline __device__ int8_t deconvert(RadixType v) { + return v - 128; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int16_t v) { + static_assert(sizeof(short) == 2, ""); + return 32768u + v; + } + + static inline __device__ int16_t deconvert(RadixType v) { + return v - 32768; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(int32_t v) { + static_assert(sizeof(int) == 4, ""); + return 2147483648u + v; + } + + static inline __device__ int32_t deconvert(RadixType v) { + return v - 2147483648u; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint64_t RadixType; + + static inline __device__ RadixType convert(int64_t v) { + static_assert(sizeof(int64_t) == 8, ""); + return 9223372036854775808ull + v; + } + + static inline __device__ int64_t deconvert(RadixType v) { + return v - 9223372036854775808ull; + } +}; + +template <> +struct TopKTypeConfig { + typedef uint64_t RadixType; + + static inline __device__ RadixType convert(double v) { + RadixType x = __double_as_longlong(v); + RadixType mask = -((x >> 63)) | 0x8000000000000000; + return (v == v) ? (x ^ mask) : 0xffffffffffffffff; + } + + static inline __device__ double deconvert(RadixType v) { + RadixType mask = ((v >> 63) - 1) | 0x8000000000000000; + return __longlong_as_double(v ^ mask); + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(at::Half v) { +#if defined(__CUDA_ARCH__) || defined(USE_ROCM) + RadixType x = __half_as_ushort(v); + RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000; + return (v == v) ? (x ^ mask) : 0xffff; +#else + CUDA_KERNEL_ASSERT(false); + return 0u; +#endif + } + + static inline __device__ at::Half deconvert(RadixType v) { +#if defined(__CUDA_ARCH__) || defined(USE_ROCM) + RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff; + return __ushort_as_half(v ^ mask); +#else + CUDA_KERNEL_ASSERT(false); + return static_cast(0); +#endif + } +}; + +template <> +struct TopKTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType convert(at::BFloat16 v) { + RadixType x = v.x; + RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000; + return (v == v) ? (x ^ mask) : 0xffff; + } + + static inline __device__ at::BFloat16 deconvert(RadixType v) { + RadixType mask = (v & 0x00008000) ? 0x00008000 : 0x0000ffff; + at::BFloat16 r; + r.x = (v ^ mask); + return r; + } +}; + +// This function counts the distribution of all input values in a +// slice we are selecting by radix digit at `radixDigitPos`, but only +// those that pass the filter `((v & desiredMask) == desired)`. +// This produces and broadcasts the seen counts for a single block only. +// `smem` must have at least `RadixSize` elements. +template < + typename scalar_t, + typename bitwise_t, + typename index_t, + typename CountType, + int RadixSize, + int RadixBits> +__device__ void countRadixUsingMask( + CountType counts[RadixSize], + CountType* smem, + bitwise_t desired, + bitwise_t desiredMask, + int radixDigitPos, + index_t sliceSize, + index_t withinSliceStride, + const scalar_t* data) { + // Clear out per-thread counts from a previous round +#pragma unroll + for (int i = 0; i < RadixSize; ++i) { + counts[i] = 0; + } + + if (threadIdx.x < RadixSize) { + smem[threadIdx.x] = 0; + } + __syncthreads(); + + // Scan over all the data. Upon a read, the warp will accumulate + // counts per each digit in the radix using warp voting. +#if !defined(USE_ROCM) + // Must be called outside of loop to ensure all threads participate + unsigned mask = WARP_BALLOT(threadIdx.x < sliceSize); +#endif + for (index_t i = threadIdx.x; i < sliceSize;) { + bitwise_t val = + TopKTypeConfig::convert(doLdg(&data[i * withinSliceStride])); + + bool hasVal = ((val & desiredMask) == desired); + bitwise_t digitInRadix = at::cuda::Bitfield::getBitfield( + val, radixDigitPos, RadixBits); + +#pragma unroll + for (uint32_t j = 0; j < RadixSize; ++j) { + bool vote = hasVal && (digitInRadix == j); +#if defined(USE_ROCM) + counts[j] += __popcll(WARP_BALLOT(vote)); +#else + counts[j] += __popc(WARP_BALLOT(vote, mask)); +#endif + } + i += blockDim.x; +#if !defined(USE_ROCM) + mask = WARP_BALLOT(i < sliceSize, mask); +#endif + } + + // Now, for each warp, sum values + if (at::cuda::getLaneId() == 0) { +#pragma unroll + for (uint32_t i = 0; i < RadixSize; ++i) { + gpuAtomicAddNoReturn(&smem[i], counts[i]); + } + } + + __syncthreads(); + + // For each thread, read in the total counts +#pragma unroll + for (uint32_t i = 0; i < RadixSize; ++i) { + counts[i] = smem[i]; + } + + __syncthreads(); +} + +// Over what radix we are selecting values +constexpr int RADIX_BITS = 2; // digits are base-(2 ^ RADIX_BITS) +constexpr int RADIX_SIZE = 4; // 2 ^ RADIX_BITS +constexpr int RADIX_MASK = (RADIX_SIZE - 1); + +// This finds the unique value `v` that matches the pattern +// ((v & desired) == desiredMask) in our sorted int format +template +__device__ scalar_t findPattern( + scalar_t* smem, + const scalar_t* data, + index_t sliceSize, + index_t withinSliceStride, + bitwise_t desired, + bitwise_t desiredMask) { + if (threadIdx.x < 2) { + smem[threadIdx.x] = static_cast(0); + } + __syncthreads(); + + // All threads participate in the loop, in order to sync on the flag + index_t numIterations = + round_up(sliceSize, static_cast(blockDim.x)); + for (index_t i = threadIdx.x; i < numIterations; i += blockDim.x) { + bool inRange = (i < sliceSize); + scalar_t v = inRange ? doLdg(&data[i * withinSliceStride]) + : static_cast(0); + + if (inRange && + ((TopKTypeConfig::convert(v) & desiredMask) == desired)) { + // There should not be conflicts if we are using findPattern, + // since the result is unique + smem[0] = static_cast(1); + smem[1] = v; // can't use val as the flag, since it could be 0 + } + + __syncthreads(); + + scalar_t found = smem[0]; + scalar_t val = smem[1]; + + __syncthreads(); + + // Check to see if a thread found the value + if (found != static_cast(0)) { + // all threads return this value + return val; + } + } + + // should not get here + CUDA_KERNEL_ASSERT(false); + return static_cast(0); +} + +// Returns the top-Kth element found in the data using radix selection +template +__device__ void radixSelect( + const scalar_t* data, + index_t k, + bool largest, + index_t sliceSize, + index_t withinSliceStride, + int* smem, + scalar_t* topK) { + // Per-thread buckets into which we accumulate digit counts in our + // radix + int counts[RADIX_SIZE]; + + // We only consider elements x such that (x & desiredMask) == desired + // Initially, we consider all elements of the array, so the above + // statement is true regardless of input. + bitwise_t desired = 0; + bitwise_t desiredMask = 0; + + // We are looking for the top kToFind-th element when iterating over + // digits; this count gets reduced by elimination when counting + // successive digits + int kToFind = k; + + // We start at the most significant digit in our radix, scanning + // through to the least significant digit + for (int digitPos = sizeof(scalar_t) * 8 - RADIX_BITS; digitPos >= 0; + digitPos -= RADIX_BITS) { + // Count radix distribution for the current position and reduce + // across all threads + countRadixUsingMask< + scalar_t, + bitwise_t, + index_t, + int, + RADIX_SIZE, + RADIX_BITS>( + counts, + smem, + desired, + desiredMask, + digitPos, + sliceSize, + withinSliceStride, + data); + + auto found_unique = [&](int i, int count) -> bool { + /* All threads have the same value in counts here, so all */ + /* threads will return from the function. */ + if (count == 1 && kToFind == 1) { + /* There is a unique answer. */ + desired = at::cuda::Bitfield::setBitfield( + desired, i, digitPos, RADIX_BITS); + desiredMask = at::cuda::Bitfield::setBitfield( + desiredMask, RADIX_MASK, digitPos, RADIX_BITS); + + /* The answer is now the unique element v such that: */ + /* (v & desiredMask) == desired */ + /* However, we do not yet know what the actual element is. We */ + /* need to perform a search through the data to find the */ + /* element that matches this pattern. */ + *topK = findPattern( + (scalar_t*)smem, + data, + sliceSize, + withinSliceStride, + desired, + desiredMask); + return true; + } + return false; + }; + auto found_non_unique = [&](int i, int count) -> bool { + if (count >= kToFind) { + desired = + at::cuda::Bitfield::setBitfield( + desired, i, digitPos, RADIX_BITS); + desiredMask = at::cuda::Bitfield::setBitfield( + desiredMask, RADIX_MASK, digitPos, RADIX_BITS); + + /* The top-Kth element v must now be one such that: */ + /* (v & desiredMask == desired) */ + /* but we haven't narrowed it down; we must check the next */ + /* least-significant digit */ + return true; + } + kToFind -= count; + return false; // continue the loop + }; + + // All threads participate in the comparisons below to know the + // final result + if (largest) { + // Process in descending order +#pragma unroll + for (int i = RADIX_SIZE - 1; i >= 0; --i) { + int count = counts[i]; + if (found_unique(i, count)) { + return; + } + if (found_non_unique(i, count)) { + break; + } + } + } else { + // Process in ascending order +#pragma unroll + for (int i = 0; i < RADIX_SIZE; ++i) { + int count = counts[i]; + if (found_unique(i, count)) { + return; + } + if (found_non_unique(i, count)) { + break; + } + } + } + } // end digitPos for + + // There is no unique result, but there is a non-unique result + // matching `desired` exactly + *topK = TopKTypeConfig::deconvert(desired); +} +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c5be7518ea83713d9cee9699ea39ad9896e81d22 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh @@ -0,0 +1,436 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +// Used for a segmented reduction +struct ModeUnsignedBoolPair { + unsigned int val; + bool flag; +}; + +// In the kernel below, we have a common pattern of reducing (unsigned int, +// unsigned int) pairs of data +struct ModeUnsignedPair { + unsigned int val; + unsigned int index; +}; + +// Inclusive Scan via an upsweep/downsweep mechanism. Assumes: +// +// 1. Power2ScanSize is a power of 2. This code still works for collections that +// do not exactly contain a power of 2 number of elements, simply round up to +// the nearest power of 2 and then call. +// +// 2. That there are two-elements per thread, i.e. the size of the smem storage +// is 2 * blockDim.x * sizeof(T). +// +// Consider a (+)-Scan on the following elements: +// +// Upsweep: +// +// 0 1 2 3 4 5 6 7 +// 1 5 9 13 +// 6 22 +// 28 +// +// Downsweep: +// 15 +// 3 10 21 +template +__device__ void inclusivePrefixScan(T* smem, BinaryOp binop) { + // Reduce step ("upsweep") +#pragma unroll + for (int stride = 1; stride < Power2ScanSize; stride <<= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if (index < Power2ScanSize) { + smem[index] = binop(smem[index], smem[index - stride]); + } + __syncthreads(); + } + + // Post-reduce step ("downsweep") +#pragma unroll + for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if ((index + stride) < Power2ScanSize) { + smem[index + stride] = binop(smem[index + stride], smem[index]); + } + __syncthreads(); + } +} + +// Block-wide reduction where each thread locally reduces N +// values before letting a single warp take over - assumes +// threadVals is in registers, not shared memory +// +// If smem is not used again, there is no need to __syncthreads before this +// call. However, if smem will be used, e.g., this function is called in a loop, +// then __syncthreads is needed either before or afterwards to prevent non-0 +// threads overriding smem in the next loop before num-0 thread reads from it. +template +__device__ T reduceBlockWithNThreadLocalReductions( + T* smem, + T threadVals[N], + const unsigned int numVals, + ReduceOp reduceOp, + T init) { + int offset = threadIdx.x * N; + T local = offset < numVals ? threadVals[0] : init; + +#pragma unroll + for (int i = 1; i < N; ++i) { + ++offset; + T next = offset < numVals ? threadVals[i] : init; + local = reduceOp.combine(local, next); + } + + return cuda_utils::BlockReduce(local, reduceOp, init, smem); +} + +template +__device__ inline void swapVars(T& t1, T& t2) { + T tmp = t1; + t1 = t2; + t2 = tmp; +} + +template +__device__ inline void bitonicSwap( + K& kA, + V& vA, + bool& validA, + K& kB, + V& vB, + bool& validB, + bool dir, + const Comparator& comp) { + // Invalid entries always sort to the end + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(vA, vB); + swapVars(validA, validB); + } +}; + +template +__device__ inline void bitonicSwapKeys( + K& kA, + bool& validA, + K& kB, + bool& validB, + bool dir, + const Comparator& comp) { + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(validA, validB); + } +} + +template < + typename K, + typename IndexType, + int Power2SortSize, + typename Comparator> +__device__ inline void bitonicSortKeys( + K keys[Power2SortSize], + bool valid[Power2SortSize], + const Comparator& comp) { +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int size = 2; size < Power2SortSize; size *= 2) { + bool flag = ((threadIdx.x & (size / 2)) != 0); + +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int stride = size / 2; stride > 0; stride /= 2) { + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwapKeys( + keys[pos], + valid[pos], + keys[pos + stride], + valid[pos + stride], + flag, + comp); + } + } + +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwapKeys( + keys[pos], + valid[pos], + keys[pos + stride], + valid[pos + stride], + false, + comp); + } + + __syncthreads(); +} + +// The mode kernel has the following characteristics: It uses internal shared +// memory buffers of Power2Size, which must be greater than the number of +// elements. Additionally, there is one block for every slice to calculate the +// mode for, and in each block there is one thread for every two elements. +// +// Both sorted and positions are assumed to be contiguous Tensors with the mode +// dimension as the innermost dim, such that we can get the particular slice for +// a Tensor via its linear block dimension * the slice size. +template +__launch_bounds__(1024, 1) +__global__ void compute_mode( + const T* input, + at::cuda::detail::TensorInfo values, + at::cuda::detail::TensorInfo indices, + int64_t sliceSize, + int64_t slices) { + int tidx = threadIdx.x; + int stidx = blockDim.x + threadIdx.x; // Second index this thread responsible for + + // First, we need to calculate the offset into the sorted Tensor that + // represents the start of the slice for this block to calculate the mode for. + // This offset is a combination of the gridIndices, and the number of elements + // in the slice. + unsigned int blockId = getLinearBlockId(); + unsigned int linearOffset = blockId * sliceSize; + + if (blockId >= slices) { + return; + } + + // shmem is a dynamically sized buffer we will use throughout the kernel to + // handle computation efficiently. The size of this shmem must be + // sizeof(T) * Power2Size + (2 * sizeof(unsigned int) * Power2Size) + // + // Initially, the buffer will be organized as follows: + // + // [smem (slice elements) | bmem (valid indices) | ] + extern __shared__ char shmem[]; + + // smem represents a proportion of the shared memory buffer that is used to + // store the elements from the slice: + T* smem = reinterpret_cast(shmem); + + // Each thread loads up to two elements from the Tensor into shared memory + if (tidx < sliceSize) { + smem[tidx] = c10::load(&input[linearOffset + tidx]); + } + if (stidx < sliceSize) { + smem[stidx] = c10::load(&input[linearOffset + stidx]); + } + + // Next, we initialize a boolean region of the buffer, offset by the loaded + // element smem region + bool* bmem = reinterpret_cast(&smem[Power2Size]); + + // The first use of this region stores bmem[i] = i < sliceSize to mark the + // valid components in the smem buffer + bmem[tidx] = tidx < sliceSize; + bmem[stidx] = stidx < sliceSize; + __syncthreads(); // barrier for smem, bmem initialization + + // First, sort the input slice in ascending order. smem contains the input + // elements, and bmem marks the valid indices + bitonicSortKeys( + smem, bmem, [&] GPU_LAMBDA(const auto& a, const auto& b) { + return a < b; + }); + __syncthreads(); // make no assumptions that the sort syncs at end + + // The next step of our algorithm is performing a block-wide comparison of + // neighboring elements. In particular, given an sorted input slice A, we + // produce an output slice B, such that B[i] = 1 if A[i-i] != A[i], otherwise + // 0. + // + // Given the input A = [0, 0, 1, 1, 2, 2, 2, 4, 5, 6, 6, 7, 8] + // B = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1] + // + // In particular, we can think of B[i] true indicating the start of a sequence + // of equal values in the sorted list. Similarly, we will also store the + // negation of B, which we'll call C. In particular, we can think of C[i] = + // true iff A[i-1] == A[i] in our original sorted slice. + // + // C = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0] + + // We overwrite bmem, and treat the rest of shared memory as a buffer of + // (index, flag) pairs where the index represents values from C, and the flag + // represents values from B. + // + // [smem (sorted slice) | ubpmem (index, flag pairs)] + + struct ModeUnsignedBoolPair* ubpmem = + reinterpret_cast(&smem[Power2Size]); + + if (tidx == 0) { + ubpmem[0].flag = true; + ubpmem[0].val = 0; + } + + // Compares elements (0, 1), (2, 3), ... and sets 1, 3, ... + ubpmem[tidx * 2 + 1].flag = + smem[tidx * 2] != smem[tidx * 2 + 1]; // (0, 1), (1, 2), etc. + ubpmem[tidx * 2 + 1].val = !ubpmem[tidx * 2 + 1].flag; + + // Compares elements (1, 2), (3, 4), ... and sets 2, 4, ... + if (((tidx + 1) * 2) < Power2Size) { + ubpmem[(tidx + 1) * 2].flag = + smem[((tidx + 1) * 2) - 1] != smem[(tidx + 1) * 2]; + ubpmem[(tidx + 1) * 2].val = !ubpmem[(tidx + 1) * 2].flag; + } + __syncthreads(); // barrier for ubpmem initialization + + // Next, we perform a segmented prefix sum on the neighboring elements, where + // the presence of a one indicates the start of a segment. In this case B acts + // as the segment start flags, and C is the buffer to be summed: + // + // Input (C) = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0] + // Flag (B) = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1] + // Output (C) = [0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0] + // + // Afterwards, the (index) components of the ubpmem buffer contain the lengths + // of the segments (minus 1), i.e. the counts of each element in the original + // input. + inclusivePrefixScan( + ubpmem, [=] GPU_LAMBDA(const auto& a, const auto& b) { + ModeUnsignedBoolPair c; + c.val = a.flag ? a.val : a.val + b.val; + c.flag = a.flag | b.flag; + return c; + }); + // assumes scan syncs at the end + + // Next, we reinterpret the ubpmem buffer as pairs of unsigned integers (i.e. + // we treat the boolean flag regions as integers). We initialize these to + // represent indices, and we'll call this buffer I + struct ModeUnsignedPair* uupmem = + reinterpret_cast(ubpmem); + + // At this point, we need to find the maximum element in lengths buffer C. + // This element will represent the count (-1) of the mode. Because of the + // way we have set up the problem, the index where this mode occurs will + // also be the location of the mode value in the sorted array, e.g. + // + // smem = [0, 0, 1, 1, 1, 2] + // C = [0, 1, 0, 1, 2, 0] + // I = [0, 1, 2, 3, 4, 5] + // ^ + // maximum value, also aligned with mode = 1 + // + // We perform a block wide max-reduction of the C buffer, but we also need the + // indices to come along with it, so we utilize the uupmem construction. + // + // At the end we need to return the ModeUnsignedPair containing index = 4, val + // = 2, which represents the max + + // In practice, we will make each thread locally reduce 2 values in its + // registers prior to the global block-wide reduction. Note that instead of + // tidx/stidx, we utilize tidx * 2, tidx * 2 + 1, so each thread deals with + // adjacent elements. This is because the reduce code below relies on thread + // elements to be adjacent. + struct ModeUnsignedPair uup[2]; + uup[0].index = tidx * 2; + uup[0].val = ubpmem[tidx * 2].val; + uup[1].index = tidx * 2 + 1; + uup[1].val = ubpmem[tidx * 2 + 1].val; + __syncthreads(); + + struct ModeUnsignedPair max = {0, 0}; + + struct MaxOp { + inline __device__ ModeUnsignedPair combine(ModeUnsignedPair a, ModeUnsignedPair b) const { + return b.val > a.val ? b : a; + } + + inline __device__ ModeUnsignedPair warp_shfl_down(ModeUnsignedPair acc, int offset) const { + ModeUnsignedPair ret; + ret.index = WARP_SHFL_DOWN(acc.index, offset); + ret.val = WARP_SHFL_DOWN(acc.val, offset); + return ret; + } + } max_op; + + max = reduceBlockWithNThreadLocalReductions<2>( + uupmem, + uup, + sliceSize, + max_op, + max); + + // Store the mode in shared memory for use in finding the mode in the input + // slice + __shared__ T mode; + + // Given the above constraints, the mode is the value at the reduced index in + // the original sorted element buffer + if (tidx == 0) { + mode = smem[max.index]; + } + __syncthreads(); // broadcast mode + + // Finally, we need to find "an" index of the mode in the input + // Tensor. The API does not constrain which index we pick, but here + // we always pick the largest index. We store the index if the value + // is the mode, or 0 otherwise. Then find the maximum value. + // + // Again we reduce 2 elements in the thread's registers prior to the + // block-wide reduction + unsigned mode_index[2] = {0u, 0u}; + if (tidx * 2 < sliceSize) { + const unsigned idx = tidx * 2; + mode_index[0] = c10::load(&input[linearOffset + idx]) == mode ? idx : 0u; + } + if (tidx * 2 + 1 < sliceSize) { + const unsigned idx = tidx * 2 + 1; + mode_index[1] = c10::load(&input[linearOffset + idx]) == mode ? idx : 0u; + } + + struct MaxIndexOp { + inline __device__ unsigned combine(unsigned a, unsigned b) const { + return b > a ? b : a; + } + + inline __device__ unsigned warp_shfl_down(unsigned acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } + } max_index_op; + + int64_t index = reduceBlockWithNThreadLocalReductions<2>( + reinterpret_cast(&shmem[0]), + mode_index, + sliceSize, + max_index_op, + 0u); + + // Finally, we have the mode, and an index where it occurs. We use a single + // thread to place this in the appropriate output position + if (tidx == 0) { + unsigned int outputOffset = + at::cuda::detail::IndexToOffset::get( + blockId, values); + values.data[outputOffset] = mode; + indices.data[outputOffset] = index; + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..7efd68a5aa0316bc02e639f5382d3ed9fe2b4ef3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h @@ -0,0 +1,23 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +void launch_fused_mode_kernel( + const TensorBase &values, const TensorBase &indices, + const TensorBase &self, int64_t slice_size, int64_t slices); + +void launch_apply_mode_kernel( + const TensorBase &values, const TensorBase &indices, + const TensorBase &self, int64_t dim, int64_t ndim); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorTopK.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorTopK.h new file mode 100644 index 0000000000000000000000000000000000000000..01ae160f962d24e622663d1ed16489ed147c6c44 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorTopK.h @@ -0,0 +1,18 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { +class TensorBase; +} + +namespace at::native { +void launch_gather_topk_kernel( + const TensorBase& self, + int64_t k, int64_t dim, bool largest, + const TensorBase& values, const TensorBase& indices); +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1fffc057d29b50f533870dfd5b276677ee523b9e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +namespace at::native::internal { + +template +std::tuple unique_cuda_template( + const Tensor& self, + const bool consecutive, + const bool return_inverse, + const bool return_counts); + +} // namespace at::native::internal + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/UpSample.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/UpSample.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ca43947121b4f085038d2fc652786df8fd31ba79 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/UpSample.cuh @@ -0,0 +1,373 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +#include +#include +#include + +#include +#include + +namespace at::native { + +namespace upsample { +// TODO: Remove duplicate declaration. +TORCH_API c10::SmallVector compute_output_size( + c10::IntArrayRef input_size, // Full input tensor size. + at::OptionalIntArrayRef output_size, + std::optional> scale_factors); +} // namespace upsample + +namespace upsample_cuda { + +// TODO: Remove duplication with Upsample.h (CPU). +inline std::optional get_scale_value(std::optional> scales, int idx) { + if (!scales) { + return std::nullopt; + } + return scales->at(idx); +} + +} // namespace upsample_cuda + + +/* TODO: move this to a common place */ +template +__device__ inline scalar_t min(scalar_t a, scalar_t b) { + return a < b ? a : b; +} + +template +__device__ inline scalar_t max(scalar_t a, scalar_t b) { + return a > b ? a : b; +} + +// NOTE [ Nearest neighbor upsampling kernel implementation ] +// +// The nearest neighbor upsampling kernel implementation is symmetrical as +// expected. We launch kernels with threads mapping to destination tensors where +// kernels write data to, each thread reads data from the source tensor, this +// means: +// 1. In the forward kernel, +// src_xxx refers to properties of input tensors; +// dst_xxx refers to properties of output tensors; +// scale_factor is the ratio of src_size to dst_size; +// 2. In the backward kernel, +// src_xxx refers to properties of grad_output tensors; +// dst_xxx refers to properties of grad_input tensors; +// scale_factor is the ratio of src_size to dst_size; +// +// Because of this, we need to take the reciprocal of the scale defined by +// upsample layer during forward path. The motivation is to avoid slow +// division in the kernel code, so we can use faster multiplication instead. +// This is not necessary during backward path, since the scale_factor is already +// the reciprocal of corresponding scale_factor used in the forward path due to +// the swap of source and destination tensor. +// +// Similarly, since the mapping from grad_input to grad_output during backward +// is the reverse of the mapping of output to input, we need to have opposite +// mapping functions to compute the source index. + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +template +__host__ __forceinline__ accscalar_t compute_scales_value( + const std::optional scale, + int64_t src_size, + int64_t dst_size) { + // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults. + return (scale.has_value() && scale.value() > 0.) ? (accscalar_t)(1.0 / scale.value()) + : (accscalar_t)src_size / dst_size; +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +template +__host__ __forceinline__ accscalar_t compute_scales_value_backwards( + const std::optional scale, + int64_t src_size, + int64_t dst_size) { + // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults. + return (scale.has_value() && scale.value() > 0.) ? (accscalar_t)scale.value() + : (accscalar_t)src_size / dst_size; +} + +template +__host__ __forceinline__ accscalar_t area_pixel_compute_scale( + int input_size, + int output_size, + bool align_corners, + const std::optional scale) { + if(align_corners) { + if(output_size > 1) { + return (accscalar_t)(input_size - 1) / (output_size - 1); + } + else { + return static_cast(0); + } + } + else{ + return compute_scales_value(scale, input_size, output_size); + } +} + +template +__device__ __forceinline__ accscalar_t area_pixel_compute_source_index( + accscalar_t scale, + int dst_index, + bool align_corners, + bool cubic) { + if (align_corners) { + return scale * dst_index; + } else { + accscalar_t src_idx = scale * (dst_index + static_cast(0.5)) - + static_cast(0.5); + // See Note[Follow Opencv resize logic] + return (!cubic && src_idx < static_cast(0)) + ? static_cast(0) + : src_idx; + } +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +__device__ __forceinline__ int nearest_neighbor_compute_source_index( + const float scale, + int dst_index, + int input_size) { + // index_f32 = (output_index) * scale + // input_index = round(index_f32) + // Same as a buggy OpenCV INTER_NEAREST + // We keep this method for BC and consider as deprecated. + // See nearest_neighbor_exact_compute_source_index as replacement + const int src_index = + min(static_cast(floorf((dst_index) * scale)), input_size - 1); + return src_index; +} + +__device__ __forceinline__ int nearest_neighbor_exact_compute_source_index( + const float scale, + int dst_index, + int input_size) { + // index_f32 = (output_index + 0.5) * scale - 0.5 + // input_index = round(index_f32) + // Same as Pillow and Scikit-Image/Scipy ndi.zoom + const int src_index = + min(static_cast(floorf((dst_index + static_cast(0.5)) * scale)), input_size - 1); + return src_index; +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +__device__ __forceinline__ int nearest_neighbor_bw_compute_source_index( + const float scale, + int dst_index, + int output_size) { + // Equivalent to buggy OpenCV INTER_NEAREST + // We keep this method for BC and consider as deprecated. + // See nearest_neighbor_exact_bw_compute_source_index as replacement + const int src_index = + min(static_cast(ceilf(dst_index * scale)), output_size); + return src_index; +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +__device__ __forceinline__ int nearest_neighbor_exact_bw_compute_source_index( + const float scale, + int dst_index, + int output_size) { + // Equivalent to Pillow and Scikit-Image/Scipy ndi.zoom + const int src_index = + min(static_cast(ceilf(dst_index * scale - static_cast(0.5))), output_size); + return src_index; +} + +/* Used by UpSampleBicubic2d.cu */ +template +__device__ __forceinline__ scalar_t upsample_get_value_bounded( + const PackedTensorAccessor64& data, + int batch, + int channel, + int height, + int width, + int y, + int x) { + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + return data[batch][channel][access_y][access_x]; +} + +/* Used by UpSampleBicubic2d.cu */ +template +__device__ __forceinline__ void upsample_increment_value_bounded( + PackedTensorAccessor64& data, + int batch, + int channel, + int height, + int width, + int y, + int x, + accscalar_t value) { + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + /* TODO: result here is truncated to scalar_t, + check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912 + */ + gpuAtomicAddNoReturn( + &data[batch][channel][access_y][access_x], static_cast(value)); +} + +// Based on +// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm +template +__device__ __forceinline__ accscalar_t cubic_convolution1( + accscalar_t x, + accscalar_t A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +__device__ __forceinline__ accscalar_t cubic_convolution2( + accscalar_t x, + accscalar_t A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +__device__ __forceinline__ void get_cubic_upsampling_coefficients( + accscalar_t coeffs[4], + accscalar_t t) { + accscalar_t A = -0.75; + + accscalar_t x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + accscalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); +} + +template +__device__ __forceinline__ accscalar_t cubic_interp1d( + scalar_t x0, + scalar_t x1, + scalar_t x2, + scalar_t x3, + accscalar_t t) { + accscalar_t coeffs[4]; + get_cubic_upsampling_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +namespace upsample_antialias { + +// taken from +// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/ +// src/libImaging/Resample.c#L20-L29 +struct BilinearFilterFunctor { + + template + __device__ accscalar_t operator()(accscalar_t x) const { + if (x < 0) { + x = -x; + } + if (x < 1) { + return 1 - x; + } + return 0; + } + + static constexpr int size = 2; +}; + +// taken from +// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/ +// src/libImaging/Resample.c#L46-L62 +struct BicubicFilterFunctor { + + template + __device__ accscalar_t operator()(accscalar_t x) const { + // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm + const accscalar_t a = -0.5; + if (x < 0) { + x = -x; + } + if (x < 1) { + return ((a + 2) * x - (a + 3)) * x * x + 1; + } + if (x < 2) { + return (((x - 5) * x + 8) * x - 4) * a; + } + return 0; + } + + static constexpr int size = 4; +}; + +template +__device__ __forceinline__ void _compute_weights_span( + const int i, + const int input_size, + const accscalar_t scale, + const accscalar_t support, + int& xmin, + int& xsize, + accscalar_t& center) { + center = scale * (i + static_cast(0.5)); + xmin = max(static_cast(center - support + static_cast(0.5)), static_cast(0)); + xsize = min(static_cast(center + support + static_cast(0.5)), input_size) - xmin; +} + +template +__device__ __forceinline__ void _compute_weights( + scalar_t* wt_ptr, + const accscalar_t scale, + int interp_size, + const interp_filter_t& interp_filter, + accscalar_t xmin_m_center, + int xsize) { + + accscalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0; + accscalar_t total_w = 0.0; + int j = 0; + for (j = 0; j < xsize; j++) { + accscalar_t w = interp_filter((j + xmin_m_center + static_cast(0.5)) * invscale); + wt_ptr[j] = static_cast(w); + total_w += w; + } + for (j = 0; j < xsize; j++) { + if (total_w != 0.0) { + wt_ptr[j] /= total_w; + } + } + for (; j < interp_size; j++) { + wt_ptr[j] = static_cast(0.0); + } +} + +template +__device__ __forceinline__ accscalar_t interpolate_aa_single_dim( + const scalar_t* src, + const scalar_t* weights, + int size) { + scalar_t t = static_cast(*src); + scalar_t wts = static_cast(weights[0]); + accscalar_t output = t * wts; + + int j = 1; + for (; j < size; j++) { + wts = static_cast(weights[j]); + t = static_cast(*(src + j)); + output += t * wts; + } + return output; +} + +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/block_reduce.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/block_reduce.cuh new file mode 100644 index 0000000000000000000000000000000000000000..781567459af04a897b25217bdc5e8096c26a0190 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/block_reduce.cuh @@ -0,0 +1,152 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native::cuda_utils { + +constexpr int kCUDABlockReduceNumThreads = 512; +// Algorithmic limitation: BlockReduce does two WarpReduce calls, each +// of which reduces C10_WARP_SIZE elements. So, at most +// C10_WARP_SIZE**2 elements can be reduced at a time. +// NOTE: This is >= the max block size on current hardware anyway (1024). +// ROCm NOTE: C10_WARP_SIZE should only be used inside device functions, +// and kCUDABlockReduceMaxThreads is a host-side variable. +#ifdef USE_ROCM +static int kCUDABlockReduceMaxThreads() { + return at::cuda::warp_size() * at::cuda::warp_size(); +} +#else +constexpr int kCUDABlockReduceMaxThreads() { + return C10_WARP_SIZE * C10_WARP_SIZE; +} +#endif + +// Sums `val` across all threads in a warp. +// +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +// Picks the maximum `val` across all threads in a warp. +// +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +template +__inline__ __device__ T WarpReduceMax(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val = max_propagate_nan(val, WARP_SHFL_DOWN(val, offset)); + } + return val; +} + +struct Block1D { + static __forceinline__ __device__ int Tid() { return threadIdx.x; } + + static __forceinline__ __device__ int Warps() { + return blockDim.x / C10_WARP_SIZE; + } +}; + +struct Block2D { + static __forceinline__ __device__ int Tid() { + return threadIdx.x + threadIdx.y * blockDim.x; + } + + static __forceinline__ __device__ int Warps() { + return blockDim.x * blockDim.y / C10_WARP_SIZE; + } +}; + +// Sums `val` across all threads in a block. +// +// Warning: the return value is only valid for thread 0. +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +// - `shared` should be a pointer to shared memory with size of, at least, +// `sizeof(T) * number_of_warps` +template +__inline__ __device__ T BlockReduceSum(T val, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +// Picks out the maximum `val` across all threads in a block. +// +// Warning: the return value is only valid for thread 0. +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +// - `shared` should be a pointer to shared memory with size of, at least, +// `sizeof(T) * number_of_warps` +template +__inline__ __device__ T BlockReduceMax(T val, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduceMax(val); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? shared[lid] : T(std::numeric_limits::lowest()); + if (wid == 0) { + val = WarpReduceMax(val); + } + return val; +} + +template +__inline__ __device__ T WarpReduce(T val, const ReduceOp& op) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val = op.combine(val, op.warp_shfl_down(val, offset)); + } + return val; +} + +template +__inline__ __device__ T +BlockReduce(T val, const ReduceOp& op, const T& identity_element, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduce(val, op); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? shared[lid] : identity_element; + if (wid == 0) { + val = WarpReduce(val, op); + } + return val; +} + +} // namespace at::native::cuda_utils + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/cuBlasCommonArgs.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/cuBlasCommonArgs.h new file mode 100644 index 0000000000000000000000000000000000000000..196cdbfb62bde97e8dc8dc577d7a31062b6d71ae --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/cuBlasCommonArgs.h @@ -0,0 +1,176 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +using at::blas::ScalingType; +using at::blas::SwizzleType; + +namespace { + +// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492 +c10::MaybeOwned inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) { + if (resolve_conj && tensor.is_conj()) { + return c10::MaybeOwned::owned(tensor.resolve_conj()); + } else { + return c10::MaybeOwned::borrowed(tensor); + } +} + +c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) { + if (tensor.is_non_overlapping_and_dense()) { // common case + transpose_tensor = tensor.is_contiguous(); + return resolve_conj_if_indicated(tensor, transpose_result ? transpose_tensor : !transpose_tensor); + } + IntArrayRef tensor_strides = tensor.strides(); + IntArrayRef tensor_sizes = tensor.sizes(); + if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max(1, tensor_sizes[0]))) { + transpose_tensor = false; + return resolve_conj_if_indicated(tensor, !transpose_result); + } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max(1, tensor_sizes[1]))) { + transpose_tensor = true; + return resolve_conj_if_indicated(tensor, transpose_result); + } else { + transpose_tensor = true; + return c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); + } +} + +c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) { + if (tensor.is_non_overlapping_and_dense()) { // common case + transpose_tensor = tensor.is_contiguous(); + return resolve_conj_if_indicated(tensor, true); + } + + IntArrayRef tensor_strides = tensor.strides(); + IntArrayRef tensor_sizes = tensor.sizes(); + if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max(1, tensor_sizes[0]))) { + transpose_tensor = false; + return resolve_conj_if_indicated(tensor, true); + } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max(1, tensor_sizes[1]))) { + transpose_tensor = true; + return resolve_conj_if_indicated(tensor, true); + } else { + transpose_tensor = true; + return c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); + } +} + +} // namespace + +/** + * @brief Prepares matrices for CUBLAS operation + * + * This constructor prepares tensors for CUBLAS + * The main difference is that PyTorch uses row-major as the default and + * CUBLAS expects column-major. + * + * @details + * To enable row-major output while using CUBLAS, + * we use the mathematical identity that (A × B)^T = B^T × A^T. + * + * Transpose in this context refers to Cublas's(Fortran) definition of transpose (row-major) + * T = row-major, N = col-major + * + * Example: + * For matrices A (M×K)(row-major) and B (K×N)(row-major): + * - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major) + * - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N) + * - However, since the output form cublas is column-major this is + * - equivalent to an output of size MxN row-major as expected + * + * The transpose flags are derived from the layouts of the passed in tensors + * + * If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted + * to their unpacked values to match what cuBLAS expects. + * + * @param mat1 First input matrix + * @param mat2 Second input matrix + * @param c Output matrix (result) + * @param scale_a Optional scaling factor for first matrix + * @param scale_b Optional scaling factor for second matrix + * @param scale_result Optional scaling factor for result + */ +struct cublasCommonArgs { + cublasCommonArgs( + const Tensor& mat1, + const Tensor& mat2, + Tensor& c, + const std::optional& scale_a = std::nullopt, + const std::optional& scale_b = std::nullopt, + const std::optional& scale_result = std::nullopt, + const std::optional& scaling_choice_a = std::nullopt, + const std::optional& scaling_choice_b = std::nullopt) { + bool transpose_result = false, transpose_a = false, transpose_b = false; + result = prepare_matrix_for_cublas(c, transpose_result); + mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); + matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); + + // Handle scale tensors if provided + if (scale_a && scale_b) { + // By default since we return in row-major we run the gemm + // as B.T @ A.T, check transpose_result to determine if we flip the scales + scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr(); + scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); + scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; + scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); + scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); + scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; + } + + if (scale_result) { + scale_result_ptr = scale_result->data_ptr(); + scale_result_dtype = scale_result->scalar_type(); + } + + // Update transpose flags + if (transpose_result) { + transpose_a = !transpose_a; + transpose_b = !transpose_b; + } + + auto sizes_a = mata->sizes(); + auto sizes_b = matb->sizes(); + + m = sizes_a[transpose_result ? 1 : 0]; + k = sizes_a[transpose_result ? 0 : 1]; + n = sizes_b[transpose_result ? 0 : 1]; + lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); + ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); + result_ld = result->stride(transpose_result ? 0 : 1); + transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; + transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n'; + + // cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing + // if the gemm operands are in packed float4 + if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) { + k = k * 2; + lda = lda * 2; + ldb = ldb * 2; + } + } + + // Matrix members + char transa, transb; + int64_t m, n, k; + int64_t lda, ldb, result_ld; + c10::MaybeOwned mata, matb, result; + + // Scale members + void* scale_mata_ptr = nullptr; + void* scale_matb_ptr = nullptr; + void* scale_result_ptr = nullptr; + std::optional scale_mata_dtype; + std::optional scaling_mata_type; + std::optional scale_matb_dtype; + std::optional scaling_matb_type; + std::optional scale_result_dtype; +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/cutlass_common.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/cutlass_common.cuh new file mode 100644 index 0000000000000000000000000000000000000000..bd2c021b4c96d32bec0321bf5c9cfe52d7f45cbc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/cutlass_common.cuh @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::cuda::detail { + +template +struct enable_2x_kernel_for_sm89 : Kernel { + template + CUTLASS_DEVICE static void invoke(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 890 + Kernel::invoke(std::forward(args)...); +#endif + } +}; + +template +struct enable_3x_kernel_for_sm9x : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900 && __CUDA_ARCH__ < 1000 + Kernel::operator()(std::forward(args)...); +#endif + } +}; + +template +struct enable_3x_kernel_for_sm10 : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 1000 && __CUDA_ARCH__ < 1200 + Kernel::operator()(std::forward(args)...); +#endif + } +}; + +template +struct enable_3x_kernel_for_sm10_or_later : Kernel { + template + CUTLASS_DEVICE void operator()(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 1000 + Kernel::operator()(std::forward(args)...); +#endif + } +}; + +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adagrad_impl.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adagrad_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a02bc34d0584acfc7371892ac7685de51da99286 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adagrad_impl.cuh @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +void _fused_adagrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList state_sums, + at::TensorList state_steps, + const double lr, + const double lr_decay, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adagrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList state_sums, + at::TensorList state_steps, + const at::Tensor& lr, + const double lr_decay, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adagrad_utils.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adagrad_utils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..0c77db3160f9d2828d8c578d3e9dee22d810d020 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adagrad_utils.cuh @@ -0,0 +1,141 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + +namespace at::native { + +namespace { + +constexpr uint8_t kParamIdx = 0; +constexpr uint8_t kGradIdx = 1; +constexpr uint8_t kStateSumIdx = 2; + +template +C10_DEVICE inline void adagrad_math( + scalar_t r_args[3][kILP], + const double& corrected_lr, + const double& weight_decay, + const double& eps, + const bool& maximize, + const float* grad_scale_ptr, + const float* found_inf_ptr) { +#pragma unroll + for (int ii = 0; ii < kILP; ++ii) { + opmath_t param = static_cast(r_args[kParamIdx][ii]); + opmath_t grad = static_cast(r_args[kGradIdx][ii]); + opmath_t state_sum = static_cast(r_args[kStateSumIdx][ii]); + + if (grad_scale_ptr) { + grad /= (static_cast(*grad_scale_ptr)); + } + const opmath_t grad_to_store = grad; + if (maximize) { + grad = -grad; + } + if (weight_decay != 0) { + grad += param * weight_decay; // Can I change this to use std::fma? + } + state_sum += grad * grad; // Can I change this to use std::fma? + param = param - corrected_lr * grad / (std::sqrt(state_sum) + eps); + + r_args[kParamIdx][ii] = param; + if (grad_scale_ptr) { + r_args[kGradIdx][ii] = grad_to_store; + } + r_args[kStateSumIdx][ii] = state_sum; + } +} + +template +struct FusedAdagradMathFunctor { + using opmath_t = at::opmath_type; + + C10_DEVICE __forceinline__ void operator()( + int64_t chunk_size, + FusedOptimizerTensorListMetadata<3>& tl, + const float* lr_ptr, + const double& lr, + const double& lr_decay, + const double& weight_decay, + const double& eps, + const bool& maximize, + const float* grad_scale_ptr, + const float* found_inf_ptr) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + const double lr_double = lr_ptr ? *lr_ptr : lr; + + if (found_inf_ptr && *found_inf_ptr == 1) { + return; + } + + const auto corrected_lr = [&]() -> double { + auto* step_count = + reinterpret_cast(tl.state_steps_addresses[tensor_loc]); + const auto denom = 1 + (*step_count - 1) * lr_decay; + const auto corrected_lr = lr_double / denom; + return corrected_lr; + }(); + + scalar_t* args[3]; + scalar_t r_args[3][kILP]; + const auto n = tl.numel_for_tensor[tensor_loc] - + static_cast(chunk_idx * chunk_size); + + const bool all_aligned{ + init_args<3>(args, tl, chunk_idx, chunk_size, tensor_loc)}; + + if ((n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + load_store(r_args[kParamIdx], args[kParamIdx], 0, i_start); + load_store(r_args[kGradIdx], args[kGradIdx], 0, i_start); + load_store(r_args[kStateSumIdx], args[kStateSumIdx], 0, i_start); + + adagrad_math( + r_args, + corrected_lr, + weight_decay, + eps, + maximize, + grad_scale_ptr, + found_inf_ptr); + + load_store(args[kParamIdx], r_args[kParamIdx], i_start, 0); + load_store(args[kStateSumIdx], r_args[kStateSumIdx], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args<3>(r_args, args, i_start, chunk_size, n); + + adagrad_math( + r_args, + corrected_lr, + weight_decay, + eps, + maximize, + grad_scale_ptr, + found_inf_ptr); + +#pragma unroll + for (int i = 0; i < 3; i++) { + if (i != kGradIdx || grad_scale_ptr) { + store_args(args[i], r_args[i], i_start, chunk_size, n); + } + } + } + } + } +}; + +} // namespace + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..b8156fb622b440f051e4bb14a745ab0fe32aa93a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +void _fused_adam_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adam_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e7655b881383ca990b0aa3e84aac449cf3596bc7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +void _fused_adam_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adam_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..eb22658100a76df94ba2fbabe69d5882a8e168b6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh @@ -0,0 +1,205 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include + +namespace at::native { + +enum class ADAM_MODE : uint8_t { ORIGINAL = 0, ADAMW = 1 }; + +namespace { + +constexpr uint8_t kParamIdx = 0; +constexpr uint8_t kGradIdx = 1; +constexpr uint8_t kExpAvgIdx = 2; +constexpr uint8_t kExpAvgSqIdx = 3; +constexpr uint8_t kMaxExpAvgSqIdx = 4; + +template < + typename scalar_type, + typename opmath_t, + int depth, + ADAM_MODE adam_mode, + bool amsgrad> +C10_DEVICE inline void adam_math( + scalar_type r_args[depth][kILP], + const double& lr, + const double& beta1, + const double& beta2, + const double& weight_decay, + const double& eps, + const bool& maximize, + const float* grad_scale_ptr, + const float* found_inf_ptr, + const opmath_t& bias_correction1, + const opmath_t& bias_correction2_sqrt) { + static_assert(depth == 4 || depth == 5); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + // Load values. + opmath_t param = static_cast(r_args[kParamIdx][ii]); + opmath_t grad = static_cast(r_args[kGradIdx][ii]); + if (grad_scale_ptr) { + grad /= (static_cast(*grad_scale_ptr)); + } + const opmath_t grad_to_store = grad; + if (maximize) { + grad = -grad; + } + opmath_t exp_avg = static_cast(r_args[kExpAvgIdx][ii]); + opmath_t exp_avg_sq = static_cast(r_args[kExpAvgSqIdx][ii]); + opmath_t max_exp_avg_sq; + if (amsgrad) { + max_exp_avg_sq = static_cast(r_args[kMaxExpAvgSqIdx][ii]); + } + // Update param, grad, 1st and 2nd order momentum. + if (weight_decay != 0) { + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad += param * weight_decay; + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param -= lr * weight_decay * param; + } + } + // todo(crcrpar): use lerp + // ref: https://developer.nvidia.com/blog/lerp-faster-cuda/ + exp_avg = beta1 * exp_avg + (1 - beta1) * grad; + exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad * grad; + const opmath_t step_size = lr / bias_correction1; + opmath_t denom; + if (amsgrad) { + max_exp_avg_sq = std::max(max_exp_avg_sq, exp_avg_sq); + denom = (std::sqrt(max_exp_avg_sq) / bias_correction2_sqrt) + eps; + } else { + denom = (std::sqrt(exp_avg_sq) / bias_correction2_sqrt) + eps; + } + param -= step_size * exp_avg / denom; + + // Store results. + r_args[kParamIdx][ii] = param; + if (grad_scale_ptr) { + r_args[kGradIdx][ii] = grad_to_store; + } + r_args[kExpAvgIdx][ii] = exp_avg; + r_args[kExpAvgSqIdx][ii] = exp_avg_sq; + if (amsgrad) { + r_args[kMaxExpAvgSqIdx][ii] = max_exp_avg_sq; + } + } +} + +// [note: Conditional Gradient Store when `optimizer.step` is called by +// GradScaler] When a user is training their model(s) with an FP16 AMP recipe, +// parameter updates are done via `grad_scaler.step(optimizer)` instead of +// `optimizer.step()`. For most optimizers, GradScaler unscales gradients on +// behalf of those optimizers. Also, before `.step`, it makes sure that all the +// gradients involved are finite, which incurs a device sync. On the other hand, +// fused optimizers set their member variable of `_step_supports_amp_scaling` to +// `True` in order to remove the device sync above. This means that fused +// optimizers have to have their CUDA kernels (a) unscale gradients and (b) skip +// parameter updates accordingly. To be functionally on par with `torch.optim` +// optimizers and `_multi_tensor` ones, the kernel below writes out gradients +// only when `grad_scale_ptr != nullptr. +template +struct FusedAdamMathFunctor { + static_assert( + depth == 4 || depth == 5, + "depth of 4 for Adam, depth of 5 for Adam with AMSGrad."); + using opmath_t = at::opmath_type; + C10_DEVICE __forceinline__ void operator()( + int64_t chunk_size, + FusedOptimizerTensorListMetadata& tl, + const float* lr_ptr, + const double& lr, + const double& beta1, + const double& beta2, + const double& weight_decay, + const double& eps, + const bool& maximize, + const float* grad_scale_ptr, + const float* found_inf_ptr) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + const double lr_double = lr_ptr ? *lr_ptr : lr; + + if (found_inf_ptr && *found_inf_ptr == 1) { + return; + } + const auto [bias_correction1, bias_correction2_sqrt] = + [&]() -> std::pair { + auto* step_count = + reinterpret_cast(tl.state_steps_addresses[tensor_loc]); + const auto bias_correction1 = 1 - at::native::pow_(beta1, *step_count); + const auto bias_correction2 = 1 - at::native::pow_(beta2, *step_count); + const auto bias_correction2_sqrt = std::sqrt(bias_correction2); + return {bias_correction1, bias_correction2_sqrt}; + }(); + + scalar_type* args[depth]; + scalar_type r_args[depth][kILP]; + const auto n = tl.numel_for_tensor[tensor_loc] - chunk_idx * chunk_size; + + const bool all_aligned{ + init_args(args, tl, chunk_idx, chunk_size, tensor_loc)}; + if ((n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { +#pragma unroll + for (int i = 0; i < depth; i++) { + load_store(r_args[i], args[i], 0, i_start); + } + adam_math( + r_args, + lr_double, + beta1, + beta2, + weight_decay, + eps, + maximize, + grad_scale_ptr, + found_inf_ptr, + bias_correction1, + bias_correction2_sqrt); +#pragma unroll + for (int i = 0; i < depth; i++) { + if (i != kGradIdx || grad_scale_ptr) { + load_store(args[i], r_args[i], i_start, 0); + } + } + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); + adam_math( + r_args, + lr_double, + beta1, + beta2, + weight_decay, + eps, + maximize, + grad_scale_ptr, + found_inf_ptr, + bias_correction1, + bias_correction2_sqrt); +#pragma unroll + for (int i = 0; i < depth; i++) { + if (i != kGradIdx || grad_scale_ptr) { + store_args(args[i], r_args[i], i_start, chunk_size, n); + } + } + } + } + } +}; +} // namespace + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..3b4a105428d4a8f11472ab0e9fbcd319744933d0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +void _fused_adamw_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adamw_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4a94840cb7c15f5449b1285a4dd1f65a60ded2bf --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +void _fused_adamw_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adamw_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/im2col.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/im2col.cuh new file mode 100644 index 0000000000000000000000000000000000000000..78a9f7d0cc4c939376c69b53056b3be69c681c38 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/im2col.cuh @@ -0,0 +1,341 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#include + +namespace at::native { + +using namespace at::cuda::detail; + +// Kernel for fast unfold+copy +// (borrowed from Caffe: +// https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) +// CUDA_NUM_THREADS = 1024 + +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void im2col_kernel( + const int64_t n, + const dt* data_im, + const int64_t height, + const int64_t width, + const int64_t kernel_height, + const int64_t kernel_width, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + const int64_t height_col, + const int64_t width_col, + dt* data_col) { + CUDA_KERNEL_LOOP_TYPE(index, n, int64_t) { + int64_t w_out = index % width_col; + + int64_t idx = index / width_col; + + int64_t h_out = idx % height_col; + int64_t channel_in = idx / height_col; + int64_t channel_out = channel_in * kernel_height * kernel_width; + int64_t h_in = h_out * stride_height - pad_height; + int64_t w_in = w_out * stride_width - pad_width; + + dt* col = data_col + (channel_out * height_col + h_out) * width_col + w_out; + const dt* im = data_im + (channel_in * height + h_in) * width + w_in; + + for (int64_t i = 0; i < kernel_height; ++i) { + for (int64_t j = 0; j < kernel_width; ++j) { + int64_t h = h_in + i * dilation_height; + int64_t w = w_in + j * dilation_width; + *col = (h >= 0 && w >= 0 && h < height && w < width) + ? im[i * dilation_height * width + j * dilation_width] + : static_cast
(0); + col += height_col * width_col; + } + } + } +} + +template +void im2col( + cudaStream_t stream, + const dt* data_im, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t height_col, + const int64_t width_col, + const int64_t kernel_height, + const int64_t kernel_width, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + dt* data_col) { + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int64_t num_kernels = channels * height_col * width_col; + // Launch CUDA_NUM_THREADS = 1024 + im2col_kernel<<>>( + num_kernels, + data_im, + height, + width, + kernel_height, + kernel_width, + pad_height, + pad_width, + stride_height, + stride_width, + dilation_height, + dilation_width, + height_col, + width_col, + data_col); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +__forceinline__ __device__ void col2im_device( + const int64_t index, + const dt* data_col, + const int64_t height, + const int64_t width, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + const int64_t height_col, + const int64_t width_col, + dt* data_im) { + accT val = static_cast(0); + const int64_t w_im = index % width + pad_width; + const int64_t h_im = (index / width) % height + pad_height; + const int64_t c_im = index / (width * height); + int64_t kernel_extent_w = (kernel_w - 1) * dilation_width + 1; + int64_t kernel_extent_h = (kernel_h - 1) * dilation_height + 1; + // compute the start and end of the output + const int64_t w_col_start = (w_im < kernel_extent_w) + ? 0 + : (w_im - kernel_extent_w) / stride_width + 1; + const int64_t w_col_end = ::min(w_im / stride_width + 1, width_col); + const int64_t h_col_start = (h_im < kernel_extent_h) + ? 0 + : (h_im - kernel_extent_h) / stride_height + 1; + const int64_t h_col_end = ::min(h_im / stride_height + 1, height_col); + + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int64_t h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int64_t w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int64_t h_k = (h_im - h_col * stride_height); + int64_t w_k = (w_im - w_col * stride_width); + if (h_k % dilation_height == 0 && w_k % dilation_width == 0) { + h_k /= dilation_height; + w_k /= dilation_width; + int64_t data_col_index = + (((c_im * kernel_h + h_k) * kernel_w + w_k) * height_col + + h_col) * + width_col + + w_col; + val += data_col[data_col_index]; + } + } + } + data_im[index] = static_cast
(val); +} + +template +C10_LAUNCH_BOUNDS_1(512) +__global__ void col2im_kernel( + const int64_t n, + const dt* data_col, + const int64_t height, + const int64_t width, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + const int64_t height_col, + const int64_t width_col, + dt* data_im) { + CUDA_KERNEL_LOOP(index, n) { + col2im_device( + index, + data_col, + height, + width, + kernel_h, + kernel_w, + pad_height, + pad_width, + stride_height, + stride_width, + dilation_height, + dilation_width, + height_col, + width_col, + data_im); + } +} + +template +void col2im( + cudaStream_t stream, + const dt* data_col, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t height_col, + const int64_t width_col, + const int64_t patch_height, + const int64_t patch_width, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + dt* data_im) { + int64_t num_kernels = channels * height * width; + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // CUDA_NUM_THREADS = 1024 + col2im_kernel + <<>>( + num_kernels, + data_col, + height, + width, + patch_height, + patch_width, + pad_height, + pad_width, + stride_height, + stride_width, + dilation_height, + dilation_width, + height_col, + width_col, + data_im); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +C10_LAUNCH_BOUNDS_1(512) +__global__ void col2im_batched_kernel( + const int64_t n, + const dt* data_col, + const int64_t col_batch_stride, + const int64_t nbatch, + const int64_t height, + const int64_t width, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + const int64_t height_col, + const int64_t width_col, + dt* data_im, + const int64_t im_batch_stride) { + using accT = at::acc_type; + const auto im_numel = n * nbatch; + + CUDA_KERNEL_LOOP_TYPE(index, im_numel, int64_t) { + const auto ibatch = index / n; + const auto slice_index = index % n; + + col2im_device( + slice_index, + data_col + ibatch * col_batch_stride, + height, + width, + kernel_h, + kernel_w, + pad_height, + pad_width, + stride_height, + stride_width, + dilation_height, + dilation_width, + height_col, + width_col, + data_im + ibatch * im_batch_stride); + } +} + +template +void col2im_batched( + cudaStream_t stream, + const dt* data_col, + const int64_t col_batch_stride, + const int64_t nbatch, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t height_col, + const int64_t width_col, + const int64_t patch_height, + const int64_t patch_width, + const int64_t pad_height, + const int64_t pad_width, + const int64_t stride_height, + const int64_t stride_width, + const int64_t dilation_height, + const int64_t dilation_width, + dt* data_im, + const int64_t im_batch_stride) { + const int64_t num_kernels = channels * height * width; + const int64_t output_numel = nbatch * num_kernels; + if (output_numel == 0) { + return; // No work to do + } + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // CUDA_NUM_THREADS = 1024 + col2im_batched_kernel<<>>( + num_kernels, + data_col, + col_batch_stride, + nbatch, + height, + width, + patch_height, + patch_width, + pad_height, + pad_width, + stride_height, + stride_width, + dilation_height, + dilation_width, + height_col, + width_col, + data_im, + im_batch_stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/jit_utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/jit_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..48aca14167fd999df98adbe6805bad3ea40cf63c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/jit_utils.h @@ -0,0 +1,254 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include + +namespace at::cuda::jit { + +enum class BinaryFuncVariant {NoScalar, RhsScalar, LhsScalar}; + +struct NvrtcFunction { + CUmodule module = CUmodule(); + CUfunction function = nullptr; +}; + +struct KernelDescriptor { + std::string name; + std::string f; + c10::ScalarType f_inputs_type; + c10::ScalarType result_type; + c10::SmallVector extra_args_types; + int nInputs, nOutputs; +}; + +// Helper function to return a vector +// corresponding to the type of the arguments in parameter pack. +template +c10::SmallVector get_extra_args_types() { + return {c10::CppTypeToScalarType::value ...}; +} + +template < + typename result_type, + typename f_inputs_type, + typename... ExtraArgs> +KernelDescriptor make_kernel_descriptor( + std::string name, + std::string f, + int nInputs, + int nOutputs) { + KernelDescriptor ret; + ret.name = std::move(name); + ret.f = std::move(f); + ret.f_inputs_type = c10::CppTypeToScalarType::value; + ret.result_type = c10::CppTypeToScalarType::value; + ret.extra_args_types = get_extra_args_types(); + ret.nInputs = nInputs; + ret.nOutputs = nOutputs; + return ret; +} + +inline int can_vectorize_up_to(size_t default_alignment, void *pointer) { + auto ip = reinterpret_cast(pointer); +#ifdef USE_ROCM + if ((default_alignment == 1) && (ip % (16 * default_alignment) == 0)) { + return 16; + } + if ((default_alignment <= 2) && (ip % (8 * default_alignment) == 0)) { + return 8; + } +#else + if (ip % (8 * default_alignment) == 0) { + return 8; + } +#endif + if (ip % (4 * default_alignment) == 0) { + return 4; + } + if (ip % (2 * default_alignment) == 0) { + return 2; + } + return 1; +} + +inline int can_vectorize_up_to(const KernelDescriptor &desc, c10::ArrayRef pointers) { + TORCH_INTERNAL_ASSERT(desc.nOutputs == 1); + TORCH_INTERNAL_ASSERT(static_cast(pointers.size()) == 1 + desc.nInputs); + + // Deals with output + auto result_size = c10::scalarTypeToTypeMeta(desc.result_type).itemsize(); + auto result = can_vectorize_up_to(result_size, pointers[0]); + + // Incorporates input(s) + auto input_size = c10::scalarTypeToTypeMeta(desc.f_inputs_type).itemsize(); + for (auto i : c10::irange(1, pointers.size())) { + result = std::min(result, can_vectorize_up_to(input_size, pointers[i])); + } + + return result; +} + +//FIXME - this are defined in Loops.cuh, but including Loops.cuh here would lead to circular includes Loops.cuh -> CUDALoops.cuh -> jit_utils.h -> Loops.cuh +#ifdef USE_ROCM +#define JIT_THREAD_WORK_SIZE 4 +#else +#define JIT_THREAD_WORK_SIZE 8 +#endif + +int calc_io_size( + const int nInputs, + const int nOutputs, + const c10::ScalarType& inputs_type, + const c10::ScalarType& result_type); + +int calc_thread_work_size( + const int nInputs, + const int nOutputs, + const c10::ScalarType& inputs_type, + const c10::ScalarType& result_type); + +std::string generate_code( + int nInputs, + int nOutputs, + const std::string& func, + const std::string& name, + const std::string& f_inputs_type, + const std::string& compute_type, + const std::string& result_type, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + c10::SmallVector& extra_args_typenames, + int thread_work_size=JIT_THREAD_WORK_SIZE, + bool vectorized=false, + int vec_size=0, + bool return_by_ref=false); + +std::string generate_code( + const KernelDescriptor &desc, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + int thread_work_size=JIT_THREAD_WORK_SIZE, + bool vectorized=false, + int vec_size=0, + bool return_by_ref=false); + +std::string generate_reduction_code( + int nOutputs, + const std::string& func, + const std::string& name, + const int vt0, + const std::string& f_inputs_type, + const std::string& reduction_accum_type, + const std::string& result_type, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + +std::string generate_reduction_code( + const KernelDescriptor &desc, + const int vt0, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + +NvrtcFunction jit_pwise_function( + const std::string& code, + const std::string& kernel_name); + +void launch_jitted_pwise_function( + NvrtcFunction function, + const void* args[], + const dim3 nBlocks, + const dim3 kBlockSize, + const int smem=0); + +template +struct delayed_false : std::false_type { +}; + +// Defines type names +// NOTE: General case is instantiated only for invalid types. +// All the valid types have specialization using the TYPE_NAME_FN +// macro below. +template +inline std::string typeName() { + // we can't use static_assert(false) directly as the + // program will be not compiled even if the template is not + // instantiated, so we use `delayed_false` + // to make sure compiler doesn't eagerly raise + // fail this assertion. + static_assert(delayed_false::value, "invalid type for jiterator"); + return "void"; +} + +#define TYPE_NAME_FN(ctype, name) \ +template <> inline std::string typeName(){ \ + return std::string(#ctype); \ +} + +AT_FORALL_SCALAR_TYPES(TYPE_NAME_FN) +#undef TYPE_NAME_FN +// JIT uses std::complex directly, because nvRTC compile programs +// with -default-device, so there is no such issue like: +// "std::sin(complex) is __host__ only" +template <> inline std::string typeName(){ + return "bool"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName(){ + return "at::Half"; +} +template <> inline std::string typeName(){ + return "at::BFloat16"; +} +template <> inline std::string typeName(){ + return "at::Float8_e5m2"; +} +template <> inline std::string typeName(){ + return "at::Float8_e4m3fn"; +} +template <> inline std::string typeName() { + return "at::Float8_e5m2fnuz"; +} +template <> inline std::string typeName() { + return "at::Float8_e4m3fnuz"; +} +template <> inline std::string typeName() { + // TODO(#146647): Can the code here be made generic for any scalartype? + return "at::Float8_e8m0fnu"; +} + +#define TYPE_NAME_CASE(ctype, scalartype) \ + case ScalarType::scalartype: return typeName(); +inline std::string typeName(ScalarType t) { + switch (t) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(TYPE_NAME_CASE) + default: + TORCH_CHECK(false, "invalid type for jiterator"); + } +} +#undef TYPE_NAME_CASE + +TORCH_CUDA_CPP_API void initializeCudaContext(); + +} // namespace at::cuda::jit + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c86a786493ad6dbf4fd45dc1f14dab4f94fcb161 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh @@ -0,0 +1,689 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +namespace at::cuda { +//windows doesn't like large string literals, so split in two +const std::string reduction_template_0 = R"ESCAPE( + #define C10_HOST_DEVICE __host__ __device__ + #define C10_DEVICE __device__ + #if defined(__clang__) && defined(__HIP__) + #ifndef __forceinline__ + #define __forceinline__ inline __attribute__((always_inline)) + #endif + // until ROCm support for kernel asserts is restored + #define assert(expr) (static_cast(0)) + #endif + + template + __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + #if defined(__clang__) && defined(__HIP__) + return __shfl_down(value, delta, width); + #else + return __shfl_down_sync(mask, value, delta, width); + #endif + } + + + #if ${complex} + template + __device__ __forceinline__ std::complex WARP_SHFL_DOWN(std::complex value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + return std::complex( + #if defined(__clang__) && defined(__HIP__) + __shfl_down(value.real(), delta, width), + __shfl_down(value.imag(), delta, width)); + #else + __shfl_down_sync(mask, value.real(), delta, width), + __shfl_down_sync(mask, value.imag(), delta, width)); + #endif + } + #endif + + // aligned vector generates vectorized load/store on CUDA + template + struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; + }; + + + C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. + size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; + } + + + + + struct ReduceConfig { + //has to match host-side ReduceConfig in the eager code + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + static constexpr int input_vec_size = 4; + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + + }; + + +//TODO this will need to be different for more generic reduction functions +namespace reducer { + + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + + inline __device__ ${functor} + + inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + + inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + // wrap a normal reduction that ignores the index + inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) { + return combine(acc, val); + } +} + + +struct ReduceJitOp { + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + using InputCalculator = OffsetCalculator<1>; + using OutputCalculator = OffsetCalculator<2>; + +// static constexpr bool can_accumulate_in_output = +// std::is_convertible_v +// && std::is_convertible_v; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + uint32_t output_idx = config.output_idx<${output_vec_size}>(); + uint32_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = Array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + + value = thread_reduce<${output_vec_size}>(input_slice); + } + + if (config.should_block_y_reduce()) { + value = block_y_reduce<${output_vec_size}>(value, shared_memory); + } + if (config.should_block_x_reduce()) { + value = block_x_reduce<${output_vec_size}>(value, shared_memory); + } + + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce<${output_vec_size}>(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output<${output_vec_size}>(out, value); + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE Array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + assert(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. + return {input_vectorized_thread_reduce_impl(data)}; + } else { + uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](uint32_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](uint32_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + uint32_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = aligned_vector; + + uint32_t idx = config.input_idx(); + const uint32_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. + arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + scalar_t values[input_vec_size]; + + load_t *values_vector = reinterpret_cast(&values[0]); + + while (idx * input_vec_size + input_vec_size - 1 < end) { + *values_vector = reinterpret_cast(data)[idx]; + #pragma unroll + for (uint32_t i = 0; i < input_vec_size; i++) { + value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + uint32_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = reducer::combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE Array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + uint32_t idx = config.input_idx(); + const uint32_t end = config.num_inputs; + const uint32_t stride = config.step_input; + const int vt0=${vt0}; + + using arg_vec_t = Array; + using load_t = aligned_vector; + const load_t* data = reinterpret_cast(data_); + + // Multiple accumulators to remove dependency between unrolled loops. + arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + values[i] = data[calc(idx + i * stride) / output_vec_size]; + } + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + values[i] = data[calc(idx) / output_vec_size]; + idx += stride; + } + idx = idx_; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + template + C10_DEVICE Array block_x_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + + #if defined(USE_ROCM) || defined(FBCODE_CAFFE2) + for (int offset = 1; offset < dim_x; offset <<= 1) { + #else + for (int offset = dim_x >> 1; offset > 0; offset >>= 1) { + #endif + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = reducer::warp_shfl_down(value[i], offset); + value[i] = reducer::combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE Array block_y_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + )ESCAPE"; + + const std::string reduction_template_1 = R"ESCAPE( + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE Array accumulate_in_output( + Array out, + Array value + ) const { + Array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + ret[i] = reducer::combine(*(out[i]), value[i]); + } + return ret; + } + + + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value + ) const { + assert(!final_output); + return (out_scalar_t)value; + } + + template + C10_DEVICE void set_results(const T x, const uint32_t base_offset) const { + assert(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + +//TODO - multi-output reduction - we won't be able to use thrust::pair +//just explicitly specify typed output reads/writes +//Currently implemented for max of two outputs +// template +// C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { +// if (noutputs >= 1) { +// auto res0 = (T1*)((char*)dst[0] + base_offset); +// *res0 = x.first; +// } +// if (noutputs >= 2) { +// // base offset is computed assuming element size being sizeof(T1), so we need to make a +// // correction to obtain the correct base offset +// auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); +// *res1 = x.second; +// } +// } + + template + C10_DEVICE void set_results_to_output(Array value, Array base_offset) const { + assert(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(reducer::project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE Array global_reduce(Array value, Array *acc, char* shared_memory) const { + using arg_vec_t = Array; + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + uint32_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + uint32_t offset = config.staging_memory_offset(blockIdx.y); + reduce_buffer[offset] = value; + } + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { + __threadfence(); //complete acquire pattern + value = ident; + if (config.should_block_x_reduce()) { + uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + uint32_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } else { + uint32_t input_offset = threadIdx.y; + uint32_t step = blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +extern "C" +__launch_bounds__(${max_threads_lb}, 4) +__global__ void reduction_${name}_kernel(ReduceJitOp r){ + r.run(); +} +)ESCAPE"; + +const std::string reduction_template = reduction_template_0 + reduction_template_1; + + +const std::string &get_reduction_template() { + return reduction_template; +} + +} // namespace at::cuda + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/thread_constants.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/thread_constants.h new file mode 100644 index 0000000000000000000000000000000000000000..fc0fb8ca6043142a5c4c34b83ca1da4208878bc0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/thread_constants.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +// Marks a lambda as executable on both the host and device. The __host__ +// attribute is important so that we can access static type information from +// the host, even if the function is typically only executed on the device. +#ifndef GPU_LAMBDA +#define GPU_LAMBDA __host__ __device__ +#endif + +#if defined(USE_ROCM) +constexpr int num_threads() { + return 256; +} + +constexpr int thread_work_size() { return 4; } +#else +constexpr uint32_t num_threads() { + return C10_WARP_SIZE * 4; +} + +constexpr int thread_work_size() { return 8; } +#endif + +constexpr int block_work_size() { return thread_work_size() * num_threads(); } + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/vol2col.cuh b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/vol2col.cuh new file mode 100644 index 0000000000000000000000000000000000000000..30cc898d11e7d836144b61e6e12a308cec60673d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/cuda/vol2col.cuh @@ -0,0 +1,267 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include + +namespace at::native { + +using namespace at::cuda::detail; + +// Kernel for fast unfold+copy on volumes +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void vol2col_kernel( + const int64_t n, + const T* data_vol, + const int depth, + const int height, + const int width, + const int ksize_t, + const int ksize_h, + const int ksize_w, + const int pad_t, + const int pad_h, + const int pad_w, + const int stride_t, + const int stride_h, + const int stride_w, + const int dilation_t, + const int dilation_h, + const int dilation_w, + const int depth_col, + const int height_col, + const int width_col, + T* data_col) { + CUDA_KERNEL_LOOP_TYPE(index, n, int64_t) { + auto w_out = index % width_col; + index /= width_col; + auto h_out = index % height_col; + index /= height_col; + auto t_out = index % depth_col; + auto channel_in = index / depth_col; + auto channel_out = channel_in * ksize_t * ksize_h * ksize_w; + auto t_in = t_out * stride_t - pad_t; + auto h_in = h_out * stride_h - pad_h; + auto w_in = w_out * stride_w - pad_w; + data_col += + ((channel_out * depth_col + t_out) * height_col + h_out) * width_col + + w_out; + data_vol += ((channel_in * depth + t_in) * height + h_in) * width + w_in; + for (int i = 0; i < ksize_t; ++i) { + for (int j = 0; j < ksize_h; ++j) { + for (int k = 0; k < ksize_w; ++k) { + auto t = t_in + i * dilation_t; + auto h = h_in + j * dilation_h; + auto w = w_in + k * dilation_w; + *data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height && + w < width) + ? data_vol + [i * dilation_t * height * width + j * dilation_h * width + + k * dilation_w] + : static_cast(0); + data_col += depth_col * height_col * width_col; + } + } + } + } +} + +template +void vol2col( + cudaStream_t stream, + const T* data_vol, + const int channels, + const int depth, + const int height, + const int width, + const int depth_col, + const int height_col, + const int width_col, + const int ksize_t, + const int ksize_h, + const int ksize_w, + const int pad_t, + const int pad_h, + const int pad_w, + const int stride_t, + const int stride_h, + const int stride_w, + const int dilation_t, + const int dilation_h, + const int dilation_w, + T* data_col) { + // We are going to launch channels * depth_col * height_col * width_col + // kernels, each kernel responsible for copying a single-channel grid. + // We cast an operand to int64 so that the product will not overflow + const auto num_kernels = static_cast(channels) * depth_col * height_col * width_col; + // Launch + vol2col_kernel<<>>( + num_kernels, + data_vol, + depth, + height, + width, + ksize_t, + ksize_h, + ksize_w, + pad_t, + pad_h, + pad_w, + stride_t, + stride_h, + stride_w, + dilation_t, + dilation_h, + dilation_w, + depth_col, + height_col, + width_col, + data_col); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +__global__ void vol2im_kernel( + const int64_t n, + const T* data_col, + const unsigned depth, + const unsigned height, + const unsigned width, + const unsigned channels, + const unsigned kernel_t, + const unsigned kernel_h, + const unsigned kernel_w, + const unsigned pad_t, + const unsigned pad_h, + const unsigned pad_w, + const unsigned stride_t, + const unsigned stride_h, + const unsigned stride_w, + const unsigned dilation_t, + const unsigned dilation_h, + const unsigned dilation_w, + const unsigned depth_col, + const unsigned height_col, + const unsigned width_col, + T* data_vol) { + CUDA_KERNEL_LOOP(index, n) { + accT val = static_cast(0); + const auto w_im = index % width + pad_w; + const auto h_im = (index / width) % height + pad_h; + const auto t_im = (index / width / height) % depth + pad_t; + const auto c_im = index / (width * height * depth); + auto kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + auto kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + auto kernel_extent_t = (kernel_t - 1) * dilation_t + 1; + // compute the start and end of the output + const auto w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const auto w_col_end = std::min(w_im / stride_w + 1, width_col); + const auto h_col_start = + (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; + const auto h_col_end = std::min(h_im / stride_h + 1, height_col); + const auto t_col_start = + (t_im < kernel_extent_t) ? 0 : (t_im - kernel_extent_t) / stride_t + 1; + const auto t_col_end = std::min(t_im / stride_t + 1, depth_col); + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (unsigned t_col = t_col_start; t_col < t_col_end; t_col += 1) { + for (unsigned h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (unsigned w_col = w_col_start; w_col < w_col_end; w_col += 1) { + uint64_t t_k = (t_im - t_col * stride_t); + uint64_t h_k = (h_im - h_col * stride_h); + uint64_t w_k = (w_im - w_col * stride_w); + if (t_k % dilation_t == 0 && h_k % dilation_h == 0 && + w_k % dilation_w == 0) { + t_k /= dilation_t; + h_k /= dilation_h; + w_k /= dilation_w; + const int64_t idx_k = + ((c_im * kernel_t + t_k) * kernel_h + h_k) * kernel_w + w_k; + const int64_t data_col_index = + ((idx_k * depth_col + t_col) * + height_col + h_col) * + width_col + w_col; + val += data_col[data_col_index]; + } + } + } + } + data_vol[index] = static_cast(val); + } +} + +template +void col2vol( + cudaStream_t stream, + const T* data_col, + const int64_t channels, + const int64_t depth, + const int64_t height, + const int64_t width, + const int64_t output_depth, + const int64_t output_height, + const int64_t output_width, + const int64_t patch_t, + const int64_t patch_h, + const int64_t patch_w, + const int64_t pad_t, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_t, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_t, + const int64_t dilation_h, + const int64_t dilation_w, + T* data_vol) { + const auto num_kernels = channels * depth * height * width; + + auto check_fits_in_unsigned = + [](int64_t val, const char * name) { + constexpr auto umax = std::numeric_limits::max(); + TORCH_CHECK(val >= 0 && val <= umax, + name, " must fit in a 32-bit unsigned value"); + }; + check_fits_in_unsigned(num_kernels, "input size"); + check_fits_in_unsigned( + channels * patch_t * patch_h * patch_w, "channels x kernel size"); + + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + vol2im_kernel + <<>>( + num_kernels, + data_col, + depth, + height, + width, + channels, + patch_t, + patch_h, + patch_w, + pad_t, + pad_h, + pad_w, + stride_t, + stride_h, + stride_w, + dilation_t, + dilation_h, + dilation_w, + output_depth, + output_height, + output_width, + data_vol); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/group_norm.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/group_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..19ea16e8dfff68d95b7e8f4644b11f985a217134 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/group_norm.h @@ -0,0 +1,47 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using forward_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + int64_t /* N */, + int64_t /* C */, + int64_t /* HxW */, + int64_t /* group */, + double /* eps */, + Tensor& /* Y */, + Tensor& /* mean */, + Tensor& /* rstd */); + +using backward_fn = void (*)( + const Tensor& /* dY */, + const Tensor& /* X */, + const Tensor& /* mean */, + const Tensor& /* rstd */, + const Tensor& /* gamma */, + int64_t /* N */, + int64_t /* C */, + int64_t /* HxW */, + int64_t /* group */, + Tensor& /* dX */, + Tensor& /* dgamma */, + Tensor& /* dbeta */); + +DECLARE_DISPATCH(forward_fn, GroupNormKernel) +DECLARE_DISPATCH(backward_fn, GroupNormBackwardKernel) + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_collection.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_collection.h new file mode 100644 index 0000000000000000000000000000000000000000..af45c9e027c88eb4f3d4718d3e9433aefa343893 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_collection.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { +void bgemm_kernel_bf16bf16bf16_256_256x256x32_32x32_4x4_8x32x1_8x32x1_1x16x1x16_4_Intrawave_v4(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x256x32_32x32_4x4_16x16x1_16x16x1_1x16x1x16_4_Intrawave_v4(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x256x32_32x32_4x4_4x64x1_4x64x1_1x16x1x16_4_Intrawave_v3(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x256x32_32x32_4x4_4x64x1_4x64x1_1x16x1x16_4_Intrawave_v5(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_224x256x64_16x16_7x8_8x32x1_8x32x1_1x16x1x16_4_Intrawave_v3(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x224x64_16x16_8x7_8x32x1_8x32x1_1x32x1x8_4_Intrawave_v3(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_128x128x64_32x32_2x2_8x32x1_8x32x1_1x16x1x16_4_Intrawave_v3(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_128x128x64_32x32_2x2_8x32x1_8x32x1_1x16x1x16_4_Intrawave_v5(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_128x128x64_32x32_2x2_8x32x1_8x32x1_1x16x1x16_4_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_32x16x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_64_16x16x64_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x32x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x32x64_16x16_1x1_16x8x1_16x8x1_1x16x1x8_4_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x32x64_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x16x64_16x16_4x1_8x32x1_8x16x1_1x32x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x16x64_16x16_4x1_16x16x1_16x8x1_1x32x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x16x64_16x16_4x1_32x8x1_32x4x1_1x32x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_128x16x64_16x16_4x1_8x16x1_8x16x1_1x16x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_64x16x64_16x16_2x1_8x16x1_8x16x1_1x16x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_32x16x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_64_16x16x64_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x32x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x64x64_16x16_1x2_8x16x1_8x16x1_1x16x1x8_4_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x128x64_16x16_1x4_8x16x1_8x16x1_1x16x1x8_4_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_16x256x64_16x16_1x4_8x16x1_8x16x1_1x16x1x16_4_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); + +}; // namespace at::native +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_template.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_template.h new file mode 100644 index 0000000000000000000000000000000000000000..28bc774c32ea94bcde8039debea15e61c34376a8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_template.h @@ -0,0 +1,164 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#undef __HIP_NO_HALF_CONVERSIONS__ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// Define commonly used types. +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using F32 = float; + +using AccDataType = F32; +using DsDataType = ck::Tuple<>; +using CDataType = BF16; +using CShuffleDataType = BF16; +using DsLayout = ck::Tuple<>; +using CLayout = Row; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template < + typename A_DATA_TYPE, + typename B_DATA_TYPE, + int BLOCK_SIZE, + int MBLOCK, + int NBLOCK, + int KBLOCK, + int AK1, + int BK1, + int WAVE_TILE_M, + int WAVE_TILE_N, + int WAVE_MAP_M, + int WAVE_MAP_N, + typename ABLOCK_TRANSFER, + int ABLOCK_TRANSFER_SSPV, + int ABLOCK_TRANSFER_DSPV_K1, + typename BBLOCK_TRANSFER, + int BBLOCK_TRANSFER_SSPV, + int BBLOCK_TRANSFER_SSPV_K1, + int CSHUFFLE_MXDL_PWPS, + int CSHUFFLE_NXDL_PWPS, + typename CSHUFFLEBLOCK_TRANSFER, + typename CDESHUFFLEBLOCK_TRANSFER, + ck::BlockGemmPipelineScheduler LOOP_SCHED, + ck::BlockGemmPipelineVersion PIPELINE_VERSION, + ck::tensor_operation::device::GemmSpecialization GEMM_SPEC = + ck::tensor_operation::device::GemmSpecialization::MNPadding, + bool TRANSA = false, + bool TRANSB = false> +void bgemm_kernel_impl(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + + using ADataType = typename CkMathType::dtype; + using BDataType = typename CkMathType::dtype; + + using ALayout = typename CkTensorLayout::a_layout; + using BLayout = typename CkTensorLayout::b_layout; + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + auto gemm = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< + ALayout, // ALayout + BLayout, // BLayout + DsLayout, // DsLayout + CLayout, // CLayout + ADataType, // ADataType + BDataType, // BDataType + DsDataType, // DsDataType + CDataType, // CDataType + AccDataType, // AccDataType + CShuffleDataType, // CshuffleType + AElementOp, // AElementwiseOperation + BElementOp, // BElementwiseOperation + CDEElementOp, // CElementwiseOperation + GEMM_SPEC, // GEMMSpecialization + BLOCK_SIZE, // BlockSize + MBLOCK, // MPerBlock + NBLOCK, // NPerBlock + KBLOCK, // KPerBlock + AK1, // AK1 + BK1, // BK1 + WAVE_TILE_M, // MPerXDL + WAVE_TILE_N, // NPerXDL + WAVE_MAP_M, // MXdlPerWave + WAVE_MAP_N, // NXdlPerWave + ABLOCK_TRANSFER, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + ABLOCK_TRANSFER_SSPV, // ABlockTransferSrcScalarPerVector + ABLOCK_TRANSFER_DSPV_K1, // ABlockTransferDstScalarPerVector_AK1 + 0, // ABlockLdsExtraM + BBLOCK_TRANSFER, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + BBLOCK_TRANSFER_SSPV, // BBlockTransferSrcScalarPerVector + BBLOCK_TRANSFER_SSPV_K1, // BBlockTransferDstScalarPerVector_BK1 + 0, // BBlockLdsAddExtraN + CSHUFFLE_MXDL_PWPS, // CShuffleMXdlPerWavePerShuffle + CSHUFFLE_NXDL_PWPS, // CShuffleNXdlPerWavePerShuffle + CSHUFFLEBLOCK_TRANSFER, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + CDESHUFFLEBLOCK_TRANSFER, // CDEShuffleBlockTransferScalarPerVectors + LOOP_SCHED, // BlockGemmPipelineScheduler + PIPELINE_VERSION // BlockGemmPipelineVersion + >{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument( + b, // A and B are swapped for CK + a, + {}, + c, + n, + m, + k, + num_batches, + ldb, + lda, + {}, + ldc, + n * k, // batch_stride_a + m * k, // batch_stride_b + {}, + m * n, // batch_stride_c + a_element_op, + b_element_op, + cde_element_op + ); + TORCH_CHECK(gemm.IsSupportedArgument(argument), "wrong! device_gemm with the specified compilation parameters does not support this GEMM problem"); + auto stream = at::cuda::getCurrentHIPStream().stream(); + invoker.Run(argument, StreamConfig{stream, false}); +} + +}; // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_bgemm.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_bgemm.h new file mode 100644 index 0000000000000000000000000000000000000000..60ebc92c5d6243fc700be875fbbe05ba44ff0970 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_bgemm.h @@ -0,0 +1,21 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +template +inline void bgemm_internal_ck(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + static_assert(false&&sizeof(Dtype),"at::cuda::blas_bgemm_internal_ck: not implemented"); +} + +template <> +void bgemm_internal_ck(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..a20611c638bf9322a1a423a13ac6b43ee476ba63 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +namespace at::native { + + +template +inline void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + static_assert(false&&sizeof(Dtype),"at::cuda::blas_gemm_internal_ck: not implemented"); +} + +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) +template <> +void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(double)); +template <> +void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(float)); +template <> +void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::Half)); +template <> +void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); +#endif + + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm_template.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm_template.h new file mode 100644 index 0000000000000000000000000000000000000000..e6c2f52e7b518c581fdbb66f68d179d09a5910ca --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm_template.h @@ -0,0 +1,413 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#undef __HIP_NO_HALF_CONVERSIONS__ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// Define commonly used types. +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +namespace at::native { + +// Elementwise Operators +struct AlphaBetaAdd +{ + AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(C& c, const AB& ab) const; + + template<> + __host__ __device__ constexpr void operator() + (float& c, const float& ab) const + { + c = alpha_ * ab; + }; + + template<> + __host__ __device__ constexpr void operator() + (ck::bhalf_t& c, const ck::bhalf_t& ab) const + { + c = alpha_ * ab; + }; + + template<> + __host__ __device__ constexpr void operator() + (ck::half_t& c, const ck::half_t& ab) const + { + c = alpha_ * ab; + }; + + float alpha_; + // TODO: Leaving for now, will use later + float beta_; +}; + +template < + typename Dtype, + int BLOCK_SIZE, + int MBLOCK, + int NBLOCK, + int KBLOCK, + int AK1, + int BK1, + int MPER_XDL, + int NPER_XDL, + int MPER_WAVE, + int NPER_WAVE, + typename ABLOCK_CLUSTER_LENS, + typename ABLOCK_CLUSTER_ORDER, + typename ABLOCK_SRC_ORDER, + int ABLOCK_VECTOR_DIM, + int ABLOCK_SCALAR_VEC, + int ABLOCK_SCALAR_VEC_AK1, + bool ABLOCK_LDS_EXTRAM, + typename BBLOCK_CLUSTER_LENS, + typename BBLOCK_CLUSTER_ORDER, + typename BBLOCK_SRC_ORDER, + int BBLOCK_VECTOR_DIM, + int BBLOCK_SCALAR_VEC, + int BBLOCK_SCALAR_VEC_AK1, + bool BBLOCK_LDS_EXTRAN, + int CMPER_WAVE, + int CNPER_WAVE, + typename BLOCK_CLUSTER_LENS, + typename CDE_SCALAR_VEC, + bool PADDING = false, + bool TRANSA = false, + bool TRANSB = false> +void gemm_impl(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // Get input information. + int M = m; + int N = n; + int K = k; + + int StrideA = lda; + int StrideB = ldb; + int StrideC = ldc; + + int KBatch = 1; + + float falpha = alpha; + float fbeta = beta; + + using ADataType = typename CkMathType::dtype; + using BDataType = typename CkMathType::dtype; + using CDataType = typename CkMathType::dtype; + using DDataType = typename CkMathType::dtype; + + using AccDataType = float; + using CShuffleDataType = typename CkMathType::dtype; + + using ALayout = typename CkTensorLayout::a_layout; + using BLayout = typename CkTensorLayout::b_layout; + + using DLayout = Row; + using CLayout = Row; + + using AElementOp = PassThrough; + using BElementOp = PassThrough; + using CElementOp = AlphaBetaAdd; + + + static constexpr auto GemmDefault = + ck::tensor_operation::device::GemmSpecialization::Default; + static constexpr auto GemmMNKPadding = + ck::tensor_operation::device::GemmSpecialization::MNKPadding; + static constexpr auto GemmSpec = PADDING ? GemmMNKPadding : GemmDefault; + + + using DeviceGemmInstance = + ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3, + CLayout, + ADataType, + BDataType, + ck::Tuple<>, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CElementOp, + GemmSpec, + BLOCK_SIZE, + MBLOCK, + NBLOCK, + KBLOCK, + AK1, + BK1, + MPER_XDL, + NPER_XDL, + MPER_WAVE, + NPER_WAVE, + ABLOCK_CLUSTER_LENS, + ABLOCK_CLUSTER_ORDER, + ABLOCK_SRC_ORDER, + ABLOCK_VECTOR_DIM, + ABLOCK_SCALAR_VEC, + ABLOCK_SCALAR_VEC_AK1, + ABLOCK_LDS_EXTRAM, + BBLOCK_CLUSTER_LENS, + BBLOCK_CLUSTER_ORDER, + BBLOCK_SRC_ORDER, + BBLOCK_VECTOR_DIM, + BBLOCK_SCALAR_VEC, + BBLOCK_SCALAR_VEC_AK1, + BBLOCK_LDS_EXTRAN, + CMPER_WAVE, + CNPER_WAVE, + BLOCK_CLUSTER_LENS, + CDE_SCALAR_VEC>; + + + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{alpha, beta}; + + + using DDataArrayType = std::array; + DDataArrayType DDataArray; + + // We swap A and B inputs here as a temporary workaround + auto argument = gemm.MakeArgument( + reinterpret_cast(b), + reinterpret_cast(a), + DDataArray, + reinterpret_cast(c), + N, + M, + K, + StrideB, + StrideA, + std::array{}, + StrideC, + KBatch, + a_element_op, + b_element_op, + c_element_op); + + + TORCH_CHECK(gemm.IsSupportedArgument(argument), "wrong! device_gemm with the specified compilation parameters does not support this GEMM problem"); + + + auto stream = at::cuda::getCurrentHIPStream().stream(); + invoker.Run(argument, StreamConfig{stream, false}); +} + + +template < + typename Dtype, + int BLOCK_SIZE, + int MBLOCK, + int NBLOCK, + int KBLOCK, + int K1, + int MPER_WMMA, + int NPER_WMMA, + int MPER_WAVE, + int NPER_WAVE, + typename ABLOCK_CLUSTER_LENS, + typename ABLOCK_CLUSTER_ORDER, + typename ABLOCK_SRC_ORDER, + int ABLOCK_VECTOR_DIM, + int ABLOCK_SCALAR_VEC, + int ABLOCK_SCALAR_VEC_K1, + bool ABLOCK_LDS_EXTRAM, + typename BBLOCK_CLUSTER_LENS, + typename BBLOCK_CLUSTER_ORDER, + typename BBLOCK_SRC_ORDER, + int BBLOCK_VECTOR_DIM, + int BBLOCK_SCALAR_VEC, + int BBLOCK_SCALAR_VEC_AK1, + bool BBLOCK_LDS_EXTRAN, + int CMPER_WAVE, + int CNPER_WAVE, + typename CBLOCK_CLUSTER_LENS, + int CNPER_BLOCK, + bool PADDING = false, + bool TRANSA = false, + bool TRANSB = false> +void gemm_impl_wmma(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // Get input information. + int M = m; + int N = n; + int K = k; + + int StrideA = lda; + int StrideB = ldb; + int StrideC = ldc; + + int KBatch = 1; + + float falpha = alpha; + float fbeta = beta; + + using ADataType = typename CkMathType::dtype; + using BDataType = typename CkMathType::dtype; + using CDataType = typename CkMathType::dtype; + using DDataType = typename CkMathType::dtype; + + using AccDataType = float; + using CShuffleDataType = typename CkMathType::dtype; + + using ALayout = typename CkTensorLayout::a_layout; + using BLayout = typename CkTensorLayout::b_layout; + + using DLayout = Row; + using CLayout = Row; + + using AElementOp = PassThrough; + using BElementOp = PassThrough; + using CElementOp = PassThrough; + + + static constexpr auto GemmDefault = + ck::tensor_operation::device::GemmSpecialization::Default; + static constexpr auto GemmMNKPadding = + ck::tensor_operation::device::GemmSpecialization::MNKPadding; + static constexpr auto GemmSpec = PADDING ? GemmMNKPadding : GemmDefault; + + + using DeviceGemmInstance = + ck::tensor_operation::device::DeviceGemmWmma_CShuffle; + + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + + using DDataArrayType = std::array; + DDataArrayType DDataArray; + + // We swap A and B inputs here as a temporary workaround + auto argument = gemm.MakeArgument( + reinterpret_cast(b), + reinterpret_cast(a), + reinterpret_cast(c), + N, + M, + K, + StrideB, + StrideA, + StrideC, + b_element_op, + a_element_op, + c_element_op); + + + if(!gemm.IsSupportedArgument(argument)) + { + printf("error shape = %ld %ld %ld TRANSA=%d TRANSB=%d \n", + n, m, k,TRANSA, TRANSB); + TORCH_CHECK(false, "wrong! device_gemm with the specified compilation parameters does not support this GEMM problem"); + } + + + auto stream = at::cuda::getCurrentHIPStream().stream(); +#if 1 + invoker.Run(argument, StreamConfig{stream, false}); +#else + float ave_time = invoker.Run(argument, StreamConfig{stream, true}); + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << N <<" " < +#include +#include + +namespace at { +namespace hip { +namespace detail { +void group_gemm_ck( + const at::Tensor& mat_a, + const at::Tensor& mat_b, + const std::optional& offs, + const std::optional& bias, + at::Tensor& out); + +} // namespace detail +} // namespace hip +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_types.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_types.h new file mode 100644 index 0000000000000000000000000000000000000000..f3ea17c0b3089705ee3799f5b9e66f271fb4e9f0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_types.h @@ -0,0 +1,80 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// work around CK assuming only a single FP8 interpretation at a time +#if(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && __HIP_DEVICE_COMPILE__ +#define CK_USE_FNUZ_FP8 1 +#undef CK_USE_OCP_FP8 +#elif __HIP_DEVICE_COMPILE__ +#undef CK_USE_FNUZ_FP8 +#define CK_USE_OCP_FP8 1 +#endif + +#include +#include +#include +#include + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace at::native { + +template +struct CkMathType { + using dtype = T; +}; + +template <> +struct CkMathType { + using dtype = ck::bhalf_t; +}; + +template <> +struct CkMathType { + using dtype = ck::half_t; +}; + +template +struct CkTensorLayout { + // default goes to row-wise for now + using a_layout = Row; + using b_layout = Row; +}; + +// True denotes transpose is necessary. Default is Col, so return Row +template <> +struct CkTensorLayout { + using a_layout = Col; + using b_layout = Col; +}; + +template <> +struct CkTensorLayout { + using a_layout = Row; + using b_layout = Col; +}; + +template <> +struct CkTensorLayout { + using a_layout = Col; + using b_layout = Row; +}; + +template <> +struct CkTensorLayout { + using a_layout = Row; + using b_layout = Row; +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/im2col.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/im2col.h new file mode 100644 index 0000000000000000000000000000000000000000..7ca8091f4a0eac60acba84b08ac0b11e68402f75 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/im2col.h @@ -0,0 +1,154 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { + +template +static void im2col( + const T* data_im, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t output_height, + const int64_t output_width, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + T* data_col, + bool is_channels_last = false) { + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + + if (is_channels_last) { + at::parallel_for(0, height_col * width_col, 0, [&](int64_t begin, int64_t end) { + int64_t h_col{0}, w_col{0}; + data_index_init(begin, h_col, height_col, w_col, width_col); + + for (const auto i_col : c10::irange(begin, end)) { + for (const auto h_offset : c10::irange(kernel_h)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_offset : c10::irange(kernel_w)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + const T* slice_im = data_im + (h_im * width + w_im) * channels; + T* slice_col = data_col + (i_col * kernel_h * kernel_w + h_offset * kernel_w + w_offset) * channels; + + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + std::copy_n(slice_im, channels, slice_col); + } else { + std::fill_n(slice_col, channels, T(0)); + } + } + } + + // move the next index + data_index_step(h_col, height_col, w_col, width_col); + } + }); + } else { + at::parallel_for(0, channels_col, 0, [&](int64_t begin, int64_t end) { + int64_t c_im{0}, h_offset{0}, w_offset{0}; + data_index_init(begin, c_im, channels, h_offset, kernel_h, w_offset, kernel_w); + + for (const auto c_col : c10::irange(begin, end)) { + for (const auto h_col : c10::irange(height_col)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_col : c10::irange(width_col)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + data_col[(c_col * height_col + h_col) * width_col + w_col] = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? c10::load(&(data_im[(c_im * height + h_im) * width + w_im])) + : static_cast(0); + } + } + + // move to the next index + data_index_step(c_im, channels, h_offset, kernel_h, w_offset, kernel_w); + } + }); + } +} + +template +static void col2im( + const T* data_col, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t output_height, + const int64_t output_width, + const int64_t kernel_h, + const int64_t kernel_w, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_h, + const int64_t dilation_w, + T* data_im, + bool is_channels_last = false) { + std::fill_n(data_im, height * width * channels, T(0)); + + const int64_t height_col = output_height; + const int64_t width_col = output_width; + const int64_t channels_col = channels * kernel_h * kernel_w; + + if (is_channels_last) { + for (const auto h_col : c10::irange(height_col)) { + for (const auto w_col : c10::irange(width_col)) { + for (const auto h_offset : c10::irange(kernel_h)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_offset : c10::irange(kernel_w)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + T* slice_im = data_im + (h_im * width + w_im) * channels; + const T* slice_col = data_col + ((h_col * width_col + w_col) * kernel_h * kernel_w + + h_offset * kernel_w + w_offset) * channels; + + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) { + std::transform(slice_col, slice_col + channels, slice_im, slice_im, std::plus()); + } + } + } + } + } + } else { + for (const auto c_col : c10::irange(channels_col)) { + int64_t w_offset = c_col % kernel_w; + int64_t h_offset = (c_col / kernel_w) % kernel_h; + int64_t c_im = c_col / kernel_h / kernel_w; + + for (const auto h_col : c10::irange(height_col)) { + int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + for (const auto w_col : c10::irange(width_col)) { + int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) + data_im[(c_im * height + h_im) * width + w_im] += + data_col[(c_col * height_col + h_col) * width_col + w_col]; + } + } + } + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/im2col_shape_check.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/im2col_shape_check.h new file mode 100644 index 0000000000000000000000000000000000000000..8b2b946c01cf15fdaf723298a4b484071aae918e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/im2col_shape_check.h @@ -0,0 +1,246 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +namespace at::native { + +inline void col2im_shape_check( + const Tensor& input, + const Tensor& grad_output, + int64_t output_height, + int64_t output_width, + int64_t kernel_height, + int64_t kernel_width, + int64_t dilation_height, + int64_t dilation_width, + int64_t pad_height, + int64_t pad_width, + int64_t stride_height, + int64_t stride_width) { + TORCH_CHECK( + kernel_width > 0 && kernel_height > 0, + "kernel size should be greater than zero, but got kernel_height: ", + kernel_height, + " kernel_width: ", + kernel_width); + TORCH_CHECK( + stride_width > 0 && stride_height > 0, + "stride should be greater than zero, but got stride_height: ", + stride_height, + " stride_width: ", + stride_width); + TORCH_CHECK( + dilation_width > 0 && dilation_height > 0, + "dilation should be greater than zero, but got dilation_height: ", + dilation_height, + " dilation_width: ", + dilation_width); + TORCH_CHECK( + pad_width >= 0 && pad_height >= 0, + "padding should be non-negative, but got pad_height: ", + pad_height, + " pad_width: ", + pad_width); + + + int64_t ndim = input.ndimension(); + // allow dim=0 only the batch dimension. + TORCH_CHECK( + (ndim == 2 && input.size(0) != 0 && input.size(1) != 0) || + (ndim == 3 && input.size(1) != 0 && input.size(2) != 0), + "Expected 2D or 3D (batch mode) tensor for input with possibly 0 batch size and non-zero dimensions for input, but got: ", + input.sizes()); + + int64_t batch_dim = (ndim == 3) ? 0 : -1; + int64_t n_input_plane = input.size(batch_dim + 1); + uint64_t prod_kernel_size = 1; + + TORCH_CHECK(!c10::mul_overflows(static_cast(kernel_width), static_cast(kernel_height), &prod_kernel_size), + "Given kernel_width = ", + kernel_width, + " and kernel_height = ", + kernel_height, + " the product of kernel_width and kernel_height overflowed."); + + if (n_input_plane % (kernel_width * kernel_height) != 0) { + TORCH_CHECK(false, + "Expected size of input's dimension 1 to be divisible by the " + "product of kernel_size, but got input.size(1)=", + n_input_plane, + " and kernel_size=(", + kernel_height, + ", ", + kernel_width, + ")."); + } + + int64_t input_length = input.size(batch_dim + 2); + int64_t n_blocks_height = + div_rtn( + output_height + 2 * pad_height - + dilation_height * (kernel_height - 1) - 1, + stride_height) + + 1; + int64_t n_blocks_width = div_rtn( + output_width + 2 * pad_width - + dilation_width * (kernel_width - 1) - 1, + stride_width) + + 1; + + if (input_length != (n_blocks_height * n_blocks_width)) { + TORCH_CHECK(false, + "Given output_size=(", + output_height, + ", ", + output_width, + "), kernel_size=(", + kernel_height, + ", ", + kernel_width, + "), dilation=(", + dilation_height, + ", ", + dilation_width, + "), padding=(", + pad_height, + ", ", + pad_width, + "), stride=(", + stride_height, + ", ", + stride_width, + "), expected size of input's dimension 2 to match the calculated number of ", + "sliding blocks ", + n_blocks_height, + " * ", + n_blocks_width, + " = ", + (n_blocks_height * n_blocks_width), + ", but got input.size(2)=", + input_length, + "."); + } + + TORCH_CHECK( + n_blocks_height >= 1 && n_blocks_width >= 1, + "Given output_size=(", output_height, ", ", output_width, "), ", + "kernel_size=(", kernel_height, ", ", kernel_width, "), ", + "dilation=(", dilation_height, ", ", dilation_width, "), ", + "padding=(", pad_height, ", ", pad_width, "), ", + "stride=(", stride_height, ", ", stride_width, "), ", + "calculated shape of the array of sliding blocks as ", + "(", n_blocks_height, ", ", n_blocks_width, "), ", + "which is too small (non-positive)"); + + if (output_width < 1 || output_height < 1) { + TORCH_CHECK(false, + "Expected output spatial size to be positive, but got: output_size=(", + output_height, + ", ", + output_width, + ")."); + } +} + +inline void im2col_shape_check( + const Tensor& input, + const Tensor& grad_output, + int64_t kernel_height, + int64_t kernel_width, + int64_t dilation_height, + int64_t dilation_width, + int64_t pad_height, + int64_t pad_width, + int64_t stride_height, + int64_t stride_width) { + TORCH_CHECK( + kernel_width > 0 && kernel_height > 0, + "kernel size should be greater than zero, but got kernel_height: ", + kernel_height, + " kernel_width: ", + kernel_width); + + TORCH_CHECK( + dilation_width > 0 && dilation_height > 0, + "dilation should be greater than zero, but got dilation_height: ", + dilation_height, + " dilation_width: ", + dilation_width); + + TORCH_CHECK( + pad_width >= 0 && pad_height >= 0, + "padding should be non-negative, but got pad_height: ", + pad_height, + " pad_width: ", + pad_width); + + TORCH_CHECK( + stride_width > 0 && stride_height > 0, + "stride should be greater than zero, but got stride_height: ", + stride_height, + " stride_width: ", + stride_width); + + int64_t ndim = input.ndimension(); + + // allow dim=0 only the batch dimension. + bool valid_dims = input.size(1) != 0 && input.size(2) != 0; + TORCH_CHECK( + (ndim == 3 && input.size(0) && valid_dims) || + (ndim == 4 && valid_dims && input.size(3) != 0), + "Expected 3D or 4D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ", + input.sizes()); + + int64_t dim_batch = 0; + + if (ndim == 3) { + dim_batch = -1; + } + + int64_t input_height = input.size(dim_batch + 2); + int64_t input_width = input.size(dim_batch + 3); + int64_t output_height = div_rtn( + input_height + 2 * pad_height - + (dilation_height * (kernel_height - 1) + 1), + stride_height) + + 1; + int64_t output_width = div_rtn( + input_width + 2 * pad_width - + (dilation_width * (kernel_width - 1) + 1), + stride_width) + + 1; + + if (output_height < 1 || output_width < 1) { + TORCH_CHECK(false, + "Given input with spatial size (", + input_height, + ", ", + input_height, + "), kernel_size=(", + kernel_height, + ", ", + kernel_width, + "), dilation=(", + dilation_height, + ", ", + dilation_width, + "), padding=(", + pad_height, + ", ", + pad_width, + "), calculated shape of the array of sliding blocks as (", + output_height, + ", ", + output_width, + "), but its components must be at least one."); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_kernels.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..27c797343b6563639c5ec82b95f5e82b10d7efc9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_kernels.h @@ -0,0 +1,48 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#if AT_KLEIDIAI_ENABLED() + +namespace at::native::kleidiai { + +/** + * @brief Rearranges the quantized weight to support kleidiai inference + * @param bl Groupsize for quantization should be multiple of 32 + */ +void kai_pack_int4_rhs( + const Tensor& weight_packed, + const Tensor& weight, + const Tensor& scales, + const std::optional& bias, + const int64_t n, + const int64_t k, + const int64_t bl); + +/** + * @brief Outputs the buffer size for the packed weights + * @param bl Groupsize for quantization. 32 for groupwise , 0 for channelwise + */ +size_t kai_pack_rhs_int4_size( + const int64_t n, + const int64_t k, + const int64_t bl, + at::ScalarType tensor_dtype = at::kFloat); + +/** + * @brief Run 2 operations ( Input quantize and pack -> 4 bit Matmul ) + */ +void kai_quant_pack_lhs_int4_mm( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + const int64_t m, + const int64_t n, + const int64_t k, + const int64_t bl); +} // namespace at::native::kleidiai +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_pack.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_pack.h new file mode 100644 index 0000000000000000000000000000000000000000..30910a6add13b5f452e2f457fe3346bdf34c6014 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_pack.h @@ -0,0 +1,114 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#if AT_KLEIDIAI_ENABLED() + +namespace at::native::kleidiai { + +template +void kai_pack_rhs_groupwise_int4( + T& kernel, + const Tensor& weight_packed, + const Tensor& weight, + const Tensor& scales, + const std::optional& bias, + const int64_t n, + const int64_t k, + const int64_t bl, + const int64_t rhs_stride, + const int64_t scale_stride) { + const auto& ukernel = kernel.ukernel; + const size_t nr = ukernel.get_nr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + auto weight_packed_data = + reinterpret_cast(weight_packed.data_ptr()); + const auto weight_data = weight.data_ptr(); + auto scales_data = scales.const_data_ptr(); + + if (weight_data == nullptr) { + AT_ERROR("kai_pack_rhs_channelwise_int4: Weight data pointer is null"); + } + + if (scales_data == nullptr) { + AT_ERROR("kai_pack_rhs_channelwise_int4: Scales data pointer is null"); + } + + float* bias_ptr = + bias.has_value() ? bias.value().to(kFloat).data_ptr() : NULL; + auto& params = kernel.rhs_pack_params; + + kernel.kai_run_rhs_pack( + /*num_groups=*/1, + n, + k, + nr, + kr, + sr, + bl, + (const uint8_t*)(weight_data), + rhs_stride, + bias_ptr, + scales_data, + scale_stride, + weight_packed_data, + 0, + ¶ms); +} + +template +void kai_pack_rhs_channelwise_int4( + T& kernel, + const Tensor& weight_packed, + const Tensor& weight, + const Tensor& scales, + const std::optional& bias, + const int64_t n, + const int64_t k) { + const auto& ukernel = kernel.ukernel; + const size_t nr = ukernel.get_nr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + auto weight_packed_data = + reinterpret_cast(weight_packed.data_ptr()); + const auto weight_data = weight.data_ptr(); + + const auto scales_data = scales.to(kFloat).data_ptr(); + + if (weight_data == nullptr) { + AT_ERROR("kai_pack_rhs_channelwise_int4: Weight data pointer is null"); + } + + if (scales_data == nullptr) { + AT_ERROR("kai_pack_rhs_channelwise_int4: Scales data pointer is null"); + } + + float* bias_ptr = + bias.has_value() ? bias.value().to(kFloat).data_ptr() : NULL; + auto& params = kernel.rhs_pack_params; + + kernel.kai_run_rhs_pack( + /*num_groups=*/1, + n, + k, + nr, + kr, + sr, + (const uint8_t*)(weight_data), + (const float*)(bias_ptr), + (const float*)(scales_data), + weight_packed_data, + 0, + ¶ms); +} + +} // namespace at::native::kleidiai + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_ukernel_interface.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_ukernel_interface.h new file mode 100644 index 0000000000000000000000000000000000000000..3def42aa5876304b811616cf76cd8299f446fd15 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_ukernel_interface.h @@ -0,0 +1,226 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#if AT_KLEIDIAI_ENABLED() +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native::kleidiai { + +enum class kai_kernel_id { + // FP32 inputs, 4-bit weights, FP32 output + matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod = + 0, // Groupwise 4-bit GEMV (per-group scales, NEON DOTPROD) + matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_4x8x32_neon_i8mm = + 1, // Groupwise 4-bit GEMM (per-group scales, NEON I8MM) + matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod = + 2, // Channelwise 4-bit GEMV (per-channel scales, NEON DOTPROD) + matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm = + 3, // Channelwise 4-bit GEMM (per-channel scales, NEON I8MM) + + // BF16 inputs, 4-bit weights, BF16 output + matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod = + 4, // Channelwise 4-bit GEMV with BF16 input/output + matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm = + 5 // Channelwise 4-bit GEMM with BF16 input/output +}; + +// Channelwise Kernel mapping +struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp { + struct kai_matmul_clamp_f32_qai8dxp_qsi4cxp_ukernel ukernel; + struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params rhs_pack_params; + size_t (*kai_get_lhs_packed_size)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr); + size_t (*kai_get_rhs_packed_size)( + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr); + void (*kai_run_lhs_quant_pack)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr, + size_t m_idx_start, + const float* lhs, + size_t lhs_stride, + void* lhs_packed); + void (*kai_run_rhs_pack)( + size_t num_groups, + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr, + const uint8_t* rhs, + const float* bias, + const float* scale, + void* rhs_packed, + size_t extra_bytes, + const struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params* params); + size_t(*kai_get_lhs_quant_pack_offset)( + size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr + ); + + kai_matmul_ukernel_f32_qa8dxp_qs4cxp( + const kai_matmul_clamp_f32_qai8dxp_qsi4cxp_ukernel& kernel) + : ukernel(kernel), + kai_get_lhs_packed_size( + &kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32), + kai_get_rhs_packed_size( + &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_f32), + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32){} +}; + +struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp +kai_select_channelwise_matmul_ukernel(const kai_kernel_id id); + +// bf16 Channelwise Kernel mapping +struct kai_matmul_ukernel_bf16_qa8dxp_qs4cxp { + struct kai_matmul_clamp_bf16_qai8dxp_qsi4cxp_ukernel ukernel; + struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params rhs_pack_params; + size_t (*kai_get_lhs_packed_size)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr); + size_t (*kai_get_rhs_packed_size)( + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr); + void (*kai_run_lhs_quant_pack)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr, + size_t m_idx_start, + const void* lhs, + size_t lhs_stride, + void* lhs_packed); + void (*kai_run_rhs_pack)( + size_t num_groups, + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr, + const uint8_t* rhs, + const float* bias, + const float* scale, + void* rhs_packed, + size_t extra_bytes, + const struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params* params); + size_t(*kai_get_lhs_quant_pack_offset)( + size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr + ); + + kai_matmul_ukernel_bf16_qa8dxp_qs4cxp( + const kai_matmul_clamp_bf16_qai8dxp_qsi4cxp_ukernel& kernel) + : ukernel(kernel), + kai_get_lhs_packed_size( + &kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_bf16_neon), + kai_get_rhs_packed_size( + &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_bf16_neon), + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_bf16_neon){} + }; + +struct kai_matmul_ukernel_bf16_qa8dxp_qs4cxp +kai_select_bf16_channelwise_matmul_ukernel(const kai_kernel_id id); + +// Groupwise Kernel mapping +struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p { + struct kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel ukernel; + struct kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params rhs_pack_params; + size_t (*kai_get_lhs_packed_size)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr); + size_t (*kai_get_rhs_packed_size)( + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr, + size_t bl, + enum kai_datatype scale_dt); + void (*kai_run_lhs_quant_pack)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr, + size_t m_idx_start, + const float* lhs, + size_t lhs_stride, + void* lhs_packed); + void (*kai_run_rhs_pack)( + size_t num_groups, + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr, + size_t bl, + const uint8_t* rhs, + size_t rhs_stride, + const float* bias, + const void* scale, + size_t scale_stride, + void* rhs_packed, + size_t extra_bytes, + const struct kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params* params); + size_t(*kai_get_lhs_quant_pack_offset)( + size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr + ); + + kai_matmul_ukernel_f32_qa8dxp_qs4c32p( + const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& kernel) + : ukernel(kernel), + kai_get_lhs_packed_size( + &kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32), + kai_get_rhs_packed_size( + &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0), + kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_f32), + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0), + kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32) {} +}; + +struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p kai_select_groupwise_matmul_ukernel( + const kai_kernel_id id); + +} // namespace at::native::kleidiai +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/layer_norm.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/layer_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..a667349ad6fe9933d23a76b34345affeb650bdea --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/layer_norm.h @@ -0,0 +1,157 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + + +namespace at::native { + +namespace { + +C10_ALWAYS_INLINE void _check_rms_norm_inputs_symint( + const Tensor& input, + c10::SymIntArrayRef normalized_shape, + const Tensor& weight /* optional */) { + + const int normalized_ndim = normalized_shape.size(); + TORCH_CHECK( + normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape = ", + normalized_shape); + if (weight.defined()) { + TORCH_SYM_CHECK( + sym_equals(weight.sym_sizes(), normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", + weight.sym_sizes(), + " and normalized_shape = ", + normalized_shape); + } + + const auto input_ndim = input.dim(); + const auto input_shape = input.sym_sizes(); + TORCH_CHECK_VALUE( + input_ndim >= normalized_ndim, + "Input tensor must have at least ", normalized_ndim, " dimensions, but got ", input_ndim); + + auto expect_input_shape_msg = c10::str( + "Given normalized_shape=", normalized_shape, + ", expected input with shape [*", c10::Join(", ", normalized_shape), + "], but got input of size", input_shape); + + TORCH_SYM_CHECK( + sym_equals(input_shape.slice(input_ndim - normalized_ndim), normalized_shape), + expect_input_shape_msg); +} + +C10_ALWAYS_INLINE std::pair _check_layer_norm_inputs( + const Tensor& input, + IntArrayRef normalized_shape, + const Tensor& weight /* optional */, + const Tensor& bias /* optional */) { + + const int normalized_ndim = normalized_shape.size(); + TORCH_CHECK( + normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", + weight.sizes(), + " and normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", + bias.sizes(), + " and normalized_shape = ", + normalized_shape); + + const auto input_shape = input.sizes(); + const auto input_ndim = input.dim(); + + if (input_ndim < normalized_ndim || + !input_shape.slice(input_ndim - normalized_ndim) + .equals(normalized_shape)) { + std::stringstream ss; + ss << "Given normalized_shape=" << normalized_shape + << ", expected input with shape [*"; + for (auto size : normalized_shape) { + ss << ", " << size; + } + ss << "], but got input of size" << input_shape; + TORCH_CHECK(false, ss.str()); + } + + const int axis = input_ndim - normalized_ndim; + const int64_t M = + c10::multiply_integers(input_shape.cbegin(), input_shape.cbegin() + axis); + const int64_t N = + c10::multiply_integers(input_shape.cbegin() + axis, input_shape.cend()); + + return std::make_pair(M, N); +} + +} // namespace + +void layer_norm_cpu_out( + at::Tensor& out, + const at::Tensor& input, + const Tensor& gamma, + const Tensor& beta, + double eps, + int64_t M, + int64_t N); + +std::tuple rms_norm_composite( + const Tensor& input, + IntArrayRef normalized_shape, + const std::optional& weight_opt /* optional */, + std::optional eps); + +Tensor rms_norm_symint( + const Tensor& input, + c10::SymIntArrayRef normalized_shape, + const std::optional& weight_opt /* optional */, + std::optional eps); + +using forward_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */, + Tensor* /* mean */, + Tensor* /* rstd */); + +using backward_fn = void (*)( + const Tensor& /* dY */, + const Tensor& /* X */, + const Tensor& /* mean */, + const Tensor& /* rstd */, + const Tensor& /* gamma */, + int64_t /* M */, + int64_t /* N */, + Tensor* /* dX */, + Tensor* /* dgamma */, + Tensor* /* dbeta */); + +DECLARE_DISPATCH(forward_fn, LayerNormKernel) +DECLARE_DISPATCH(backward_fn, LayerNormBackwardKernel) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/Conv.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/Conv.h new file mode 100644 index 0000000000000000000000000000000000000000..81ede7e28c9e451b9d39edb0054244c8b3bec5ad --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/Conv.h @@ -0,0 +1,59 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#if AT_MKLDNN_ENABLED() + +namespace at::native::xpu { +C10_API Tensor convolution_pointwise( + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + +C10_API Tensor convolution_pointwise_binary( + const Tensor& input_t, + const Tensor& other_t, + const Tensor& weight_t, + const std::optional& bias_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + +C10_API Tensor& convolution_pointwise_binary_( + Tensor& other_t, + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + +} // namespace at::native::xpu + +#endif // AT_MKLDNN_ENABLED() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/FusionUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/FusionUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..77cd91aadda0ab29144b7448a4790ae285bb9fda --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/FusionUtils.h @@ -0,0 +1,58 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +// +// This header file provides utility functions for constructing and managing +// oneDNN attributes used in fusion operations on XPU devices. These utilities +// include functions for creating unary and binary post-operations attributes, +// as well as mapping string representations of operations to oneDNN attributes. +// + +namespace at::native::xpu { +at::native::onednn::Attr& unary_attr_with_arg( + onednn::Attr& attr, + std::string_view unary, + torch::List> scalars, + std::optional algorithm); + +at::native::onednn::Attr& string_to_unary_attr( + onednn::Attr& attr, + std::string_view unary); + +at::native::onednn::Attr& construct_unary_attr( + onednn::Attr& attr, + std::string_view unary, + torch::List> scalars, + std::optional algorithm); + +template +onednn::Attr& construct_binary_attr( + onednn::Attr& attr, + std::string_view binary, + const Tensor& other) { + if (binary == "mul") { + attr.append_post_binary(attr.kind_with_binary_mul, other); + } else if (binary == "sub") { + attr.append_post_binary(attr.kind_with_binary_sub, other); + } else if (binary == "div") { + attr.append_post_binary(attr.kind_with_binary_div, other); + } else if (binary == "add") { + attr.append_post_binary(attr.kind_with_binary_add, other); + } else if (binary == "sum") { + attr.append_post_sum(1.f, 1.f, 0); + } else { + TORCH_CHECK( + binary == "none", + "Binary attr ", + binary, + "is not supported for conv/linear post binary fusion"); + } + return attr; +} + +} // namespace at::native::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/Attr.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/Attr.h new file mode 100644 index 0000000000000000000000000000000000000000..dba538396fe3b1c6ae6af47e8cc7ba5368c910a3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/Attr.h @@ -0,0 +1,468 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace at::native::onednn { +/* oneDNN quantization usage: + https://oneapi-src.github.io/oneDNN/dev_guide_attributes_quantization.html# + + src_fp32 = scale_src * (src_int8 - zero_point) + wei_fp32 = scale_wei * (wei_int8 - zero_point) + dst_fp32 = scale_dst * (dst_int8 - zero_point) + fp32 Convolution: dst_fp32 = src_fp32 * wei_fp32 + Int8 Convolution: dst_fp32 = (src_int8 * wei_int8) * (scale_src * scale_wei) + Int8 Convolution: dst_int8 = 1 / scale_dst * dst_fp32; + + Considering zero-point (asymmetric): + dst_fp32 = (src_int8 - src_zp) * src_sc * wei_int8 * wei_sc + dst_sc * (dst_int8 - dst_zp) = (src_int8 - src_zp) * wei_int8 * src_sc * + wei_sc + dst_int8 = (src_int8 - src_zp) * wei_int8 * src_sc * wei_sc / dst_sc + + dst_zp + + considering bias: + fp32 Convolution: dst_fp32 = src_fp32 * wei_fp32 + bias + Int8 Convolution: dst_fp32 = (src_int8 * wei_int8) * (scale_src * scale_wei) + + bias Int8 Convolution: dst_fp32 = (src_int8 * wei_int8 + bias/(scale_src * + scale_wei)) * (scale_src * scale_wei) Int8 Convolution: dst_int8 = 1 / + scale_dst * dst_fp32; +*/ + +/* + oneDNN postops usage: + Currently, oneDNN supports 5 kinds of post ops. More details can be referred +to oneDNN doc. + https://oneapi-src.github.io/oneDNN/dev_guide_attributes_post_ops.html#doxid-dev-guide-attributes-post-ops-1dev-guide-attributes-post-ops-eltwise + +0. without post ops + dst = Conv(src, wei) + bias; + dst_int8 = 1/q_scale * dst; q_scale is the op output quantization scale + fp32 API: Attr attr; + int8 API: Attr attr(q_scale); + +1. append eltwise post op + dst = elt_scale * Eltwise{conv_scale * [Conv(src, wei) + bias], alpha, beta} + dst_int8 = 1/q_scale * dst; + fp32 API: + Attr attr; + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_eltwise(elt_scale, alpha, beta, eltwise_algorithm) + int8 API: + Attr attr(q_scale); + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_eltwise(elt_scale, alpha, beta, eltwise_algorithm) + +2. append sum post op + dst = conv_scale * Conv(src, wei) + sum_scale * (dst - zp) + dst_int8 = 1/q_scale * dst; + fp32 API: + Attr attr; + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_sum(sum_scale) + int8 API: + Attr attr(q_scale); + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_sum(sum_scale) + +3. append binary post op + dst = Binary[Conv(src, wei)] + +*/ +using kind_t = dnnl::primitive::kind; +struct PostOpParam { + // eltwise post op constructor + PostOpParam( + float scale, + float alpha, + float beta, + dnnl::algorithm algo, + kind_t kind) + : scale_(scale), alpha_(alpha), beta_(beta), algo_(algo), kind_(kind) {} + // sum post op constructor + PostOpParam(float scale, kind_t kind) : scale_(scale), kind_(kind) {} + // sum post op with zp + PostOpParam(float scale, int64_t zero_point, kind_t kind) + : scale_(scale), zero_point_(zero_point), kind_(kind) {} + // binary post op constructor + PostOpParam( + at::Tensor& binary, + dnnl::memory::desc& binary_md, + dnnl::memory::desc& expected_md, + dnnl::algorithm algo, + kind_t kind) + : binary_(binary), + meta_(binary_md), + expected_meta_(expected_md), + algo_(algo), + kind_(kind) {} + // prelu post op constructor + PostOpParam(int mask, kind_t kind) : mask_(mask), kind_(kind) {} + + // post sum or binary with scale post op constructor + PostOpParam( + at::Tensor& binary, + float scale, + dnnl::algorithm algo, + kind_t kind) + : scale_(scale), binary_(binary), algo_(algo), kind_(kind) {} + + // for int8 sum/eltwise + float scale_ = 1.0; + int64_t zero_point_ = 0; + // for eltwise + float alpha_ = 0.0; + float beta_ = 0.0; + // for binary + at::Tensor binary_ = at::Tensor(); + at::Tensor expected_binary_ = at::Tensor(); + void* binary_ptr_ = nullptr; + dnnl::memory::desc meta_ = dnnl::memory::desc(); + dnnl::memory::desc expected_meta_ = dnnl::memory::desc(); + // for prelu + int mask_ = 0; + // common + dnnl::algorithm algo_ = dnnl::algorithm::eltwise_relu; + kind_t kind_ = kind_t::eltwise; +}; + +class Attr { + public: + Attr() : q_scale_(1.f) {} + Attr(float q_scale, int64_t zp = 0) : q_scale_(q_scale), q_zero_point_(zp) {} + + /***** eltwise *****/ + dnnl::algorithm kind_with_relu = dnnl::algorithm::eltwise_relu; + dnnl::algorithm kind_with_sigmoid = dnnl::algorithm::eltwise_logistic; + dnnl::algorithm kind_with_gelu_tanh = dnnl::algorithm::eltwise_gelu_tanh; + dnnl::algorithm kind_with_gelu_erf = dnnl::algorithm::eltwise_gelu_erf; + dnnl::algorithm kind_with_mish = dnnl::algorithm::eltwise_mish; + dnnl::algorithm kind_with_linear = dnnl::algorithm::eltwise_linear; + dnnl::algorithm kind_with_swish = dnnl::algorithm::eltwise_swish; + dnnl::algorithm kind_with_sqrt = dnnl::algorithm::eltwise_sqrt; + dnnl::algorithm kind_with_tanh = dnnl::algorithm::eltwise_tanh; + dnnl::algorithm kind_with_square = dnnl::algorithm::eltwise_square; + dnnl::algorithm kind_with_abs = dnnl::algorithm::eltwise_abs; + dnnl::algorithm kind_with_exp = dnnl::algorithm::eltwise_exp; + dnnl::algorithm kind_with_log = dnnl::algorithm::eltwise_log; + dnnl::algorithm kind_with_round = dnnl::algorithm::eltwise_round; + dnnl::algorithm kind_with_hardswish = dnnl::algorithm::eltwise_hardswish; + dnnl::algorithm kind_with_soft_relu = dnnl::algorithm::eltwise_soft_relu; + dnnl::algorithm kind_with_elu = dnnl::algorithm::eltwise_elu; + dnnl::algorithm kind_with_pow = dnnl::algorithm::eltwise_pow; + dnnl::algorithm kind_with_clip = dnnl::algorithm::eltwise_clip; + // note: hardsigmoid seems oneDNN still not support + dnnl::algorithm kind_with_hardsigmoid = dnnl::algorithm::eltwise_hardsigmoid; + + /***** binary *****/ + dnnl::algorithm kind_with_binary_mul = dnnl::algorithm::binary_mul; + dnnl::algorithm kind_with_binary_add = dnnl::algorithm::binary_add; + dnnl::algorithm kind_with_binary_sub = dnnl::algorithm::binary_sub; + dnnl::algorithm kind_with_binary_div = dnnl::algorithm::binary_div; + dnnl::algorithm kind_with_binary_eq = dnnl::algorithm::binary_eq; + dnnl::algorithm kind_with_binary_ne = dnnl::algorithm::binary_ne; + dnnl::algorithm kind_with_binary_ge = dnnl::algorithm::binary_ge; + dnnl::algorithm kind_with_binary_gt = dnnl::algorithm::binary_gt; + dnnl::algorithm kind_with_binary_le = dnnl::algorithm::binary_le; + dnnl::algorithm kind_with_binary_lt = dnnl::algorithm::binary_lt; + dnnl::algorithm kind_with_binary_max = dnnl::algorithm::binary_max; + dnnl::algorithm kind_with_binary_min = dnnl::algorithm::binary_min; + + // append sum post op + Attr& append_post_sum( + float sum_scale, + float sum_q_scale = 1.f, + int64_t zp = 0) { + ops_params_.push_back( + PostOpParam(/*scale_sum*/ sum_scale * sum_q_scale, zp, kind_t::sum)); + return *this; + } + + // append eltwise post op + Attr& append_post_eltwise( + float scale, + float alpha, + float beta, + dnnl::algorithm algo) { + ops_params_.push_back( + PostOpParam(scale, alpha, beta, algo, kind_t::eltwise)); + return *this; + } + + // append binary post op + template + Attr& append_post_binary(dnnl::algorithm algo, const at::Tensor& binary) { + auto binary_ = binary.is_quantized() ? at::dequantize(binary) : binary; + bool binary_is_channels_last = + (binary_.suggest_memory_format() == at::MemoryFormat::ChannelsLast || + binary_.suggest_memory_format() == at::MemoryFormat::ChannelsLast3d); + + if constexpr (!is_matmul) { + binary_ = binary_is_channels_last ? binary_ : binary_.contiguous(); + } + dnnl::memory::desc md = get_onednn_md(binary_); + auto expected_md = dnnl::memory::desc( + md.get_dims(), md.get_data_type(), dnnl::memory::format_tag::any); + if constexpr (is_matmul) { + ops_params_.push_back(PostOpParam(binary_, md, md, algo, kind_t::binary)); + } else { + ops_params_.push_back( + PostOpParam(binary_, md, expected_md, algo, kind_t::binary)); + } + + return *this; + } + + Attr& append_scale_binary( + dnnl::algorithm algo, + at::Tensor binary, + float scale, + float sum_q_scale = 1.f, + int64_t zp = 0) { + ops_params_.push_back(PostOpParam( + binary, /*scale_sum*/ scale * sum_q_scale, algo, kind_t::binary)); + return *this; + } + + // append bias with binary_add method (only used for QConv now) + Attr& append_bias(const at::Tensor& binary, const int ndimension) { + // In PyTorch, bias are in shape of [OC], + // we expand its shape according to Conv dimension + // Conv1d [OC, 1, 1], Conv2d [1, OC, 1, ,1], Conv3d [1, OC, 1, 1, 1] + at::Tensor binary_ = binary.contiguous(); + dnnl::memory::desc binary_md; + switch (ndimension) { + case 1: + binary_md = dnnl::memory::desc( + {binary.size(0), 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abc); + break; + case 2: + binary_md = dnnl::memory::desc( + {1, binary.size(0), 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abcd); + break; + case 3: + binary_md = dnnl::memory::desc( + {1, binary.size(0), 1, 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abcde); + break; + default: + TORCH_INTERNAL_ASSERT( + 0, "XPU only supports append_bias for Conv1d, Conv2d and Conv3d."); + } + // In this case, expected_md = binary_md + ops_params_.push_back(PostOpParam( + binary_, binary_md, binary_md, kind_with_binary_add, kind_t::binary)); + return *this; + } + + // append prelu post op + Attr& append_post_prelu(int mask) { + ops_params_.push_back(PostOpParam(mask, kind_t::prelu)); + return *this; + } + + dnnl::post_ops extract_post_ops(const at::Tensor& dst) { + // this function is used to extract post ops params from the ops_params_ + // and put them into onednn post ops + for (size_t i = 0; i < ops_params_.size(); ++i) { + kind_t kind = ops_params_[i].kind_; + switch (kind) { + case kind_t::eltwise: { + dnnl::algorithm algo = ops_params_[i].algo_; + float alpha = ops_params_[i].alpha_; + float beta = ops_params_[i].beta_; + dnnl_post_ops_.append_eltwise(algo, alpha, beta); + break; + } + case kind_t::sum: { + float scale = ops_params_[i].scale_; + int64_t zero_point = ops_params_[i].zero_point_; + // TODO [Asymmetric]: + // Post-sum zp for gpu is not supported currently + dnnl_post_ops_.append_sum(scale, zero_point); + break; + } + case kind_t::binary: { + dnnl::algorithm algo = ops_params_[i].algo_; + auto expected_md = ops_params_[i].expected_meta_; + // In this case user may create src1 memory descriptor with + // format_tag::any or set a specific tag. However, in later case if + // tags mismatch with dst, it would result in suboptimal performance. + // So here we use format_tag::any to make sure the fast can be + // selected. + // Thus we use expected_md (with format_any) here to create pd instead + // of original md + dnnl_post_ops_.append_binary(algo, expected_md); + break; + } + default: + break; + } + } + + return dnnl_post_ops_; + } + + bool with_sum() { + for (size_t i = 0; i < ops_params_.size(); ++i) { + if (ops_params_[i].kind_ == kind_t::sum) { + return true; + } + } + return false; + } + + bool with_binary() { + for (size_t i = 0; i < ops_params_.size(); ++i) { + if (ops_params_[i].kind_ == kind_t::binary) { + return true; + } + } + return false; + } + + void construct_post_binary( + dnnl::primitive_desc& pd, + std::unordered_map& args) { + // This function is used to construct binary memory desc in binary post ops. + // According to oneDNN doc, the binary tensor can be in shape of + // [1, 1, 1, 1], tensor broadcast + // [1, C, 1, 1], channel broadcast + // [dst.shape], no broadcast and eltwise-wise binary operations on dst + + auto& engine = GpuEngineManager::Instance().get_engine(); + for (size_t i = 0; i < ops_params_.size(); ++i) { + kind_t kind = ops_params_[i].kind_; + if (kind == kind_t::binary) { + dnnl::memory binary_m; + auto binary = ops_params_[i].binary_; + auto md = ops_params_[i].meta_; + // query expected_md to achieve peak performance + auto expected_md = pd.query_md( + dnnl::query::exec_arg_md, + DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1); + + binary_m = at::native::onednn::make_onednn_memory( + md, engine, binary.data_ptr()); + + args.insert( + {DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binary_m}); + } + } + } + + float q_scale_ = 1.0; // the scale used to quantize the fused result from fp32 + // to int8, only works for int8 case + int64_t q_zero_point_ = 0; + std::vector ops_params_; // series of post ops + dnnl::post_ops dnnl_post_ops_; +}; + +static inline void construct_attr_for_unary( + const std::string_view& unary_post_op, + const torch::List>& unary_post_op_args, + const std::string_view& unary_post_op_algorithm, + at::native::onednn::Attr& attr) { + if (unary_post_op == "relu") { + attr = attr.append_post_eltwise( + /* eltwise_scale */ 1.f, + /* alpha */ 0.f, + /* beta */ 0.f, + attr.kind_with_relu); + } else if (unary_post_op == "leaky_relu") { + auto alpha = unary_post_op_args[0].value().to(); + attr = attr.append_post_eltwise(1.0, alpha, 0.f, attr.kind_with_relu); + } else if (unary_post_op == "tanh") { + attr = attr.append_post_eltwise(1.0f, 0.0f, 0.0f, attr.kind_with_tanh); + } else if (unary_post_op == "gelu") { + auto post_algorithm = unary_post_op_algorithm == "none" + ? attr.kind_with_gelu_erf + : attr.kind_with_gelu_tanh; + attr = attr.append_post_eltwise(1.0f, 0.0f, 0.0f, post_algorithm); + } else if (unary_post_op == "hardtanh") { + auto alpha = unary_post_op_args[0].value().to(); + auto beta = unary_post_op_args[1].value().to(); + attr = attr.append_post_eltwise(1.0, alpha, beta, attr.kind_with_clip); + } else if (unary_post_op == "hardswish") { + attr = attr.append_post_eltwise( + 1.0f, 1.f / 6.f, 1.f / 2.f, attr.kind_with_hardswish); + } else if (unary_post_op == "swish") { + attr = attr.append_post_eltwise(1.0f, 1.0f, 0.0f, attr.kind_with_swish); + } else { + TORCH_CHECK( + unary_post_op == "none", + "onednn qlinear: unsupported unary post op", + unary_post_op); + } +} + +static inline void construct_attr_by_post_op( + const std::string_view& binary_post_op, + double binary_alpha, + double input1_scale, + int64_t input1_zero_point, + std::optional accum, + const std::string_view& unary_post_op, + const torch::List>& unary_post_op_args, + const std::string_view& unary_post_op_algorithm, + at::native::onednn::Attr& attr) { + bool is_none_post_op = + (binary_post_op == "none" && unary_post_op == "none"); // not post-ops + bool is_unary_post_op_only = + (binary_post_op == "none" && unary_post_op != "none"); // ex., conv + relu + bool is_valid_binary_combination = + (binary_post_op == "add" || binary_post_op == "sum") && + (unary_post_op == "none" || unary_post_op == "relu"); + TORCH_INTERNAL_ASSERT( + is_unary_post_op_only || is_none_post_op || is_valid_binary_combination, + "Please provide valid combination of unary post operators and binary post operators"); + + if (binary_post_op == "none") { + construct_attr_for_unary( + unary_post_op, unary_post_op_args, unary_post_op_algorithm, attr); + } else if (binary_post_op == "sum") { + if (unary_post_op == "none") { + if (input1_zero_point != 0) + attr = attr.append_post_eltwise( + /*scale*/ 1.f, + /*alpha*/ 1.f, + -input1_zero_point * input1_scale, + attr.kind_with_linear); + attr = attr.append_post_sum(1, input1_scale, /*input1_zero_point*/ 0); + } else if (unary_post_op == "relu") { + if (input1_zero_point != 0) + attr = attr.append_post_eltwise( + /*scale*/ 1.f, + /*alpha*/ 1.f, + -input1_zero_point * input1_scale, + attr.kind_with_linear); + attr = attr.append_post_sum(1, input1_scale, /*input1_zero_point*/ 0); + attr = attr.append_post_eltwise( + /* scale */ 1.f, + /* alpha */ 0.f, + /* beta */ 0.f, + attr.kind_with_relu); + } + } else if (binary_post_op == "add") { + TORCH_CHECK(accum.has_value()); + attr = attr.append_post_binary(attr.kind_with_binary_add, accum.value()); + if (unary_post_op == "relu") { + attr = attr.append_post_eltwise(1.f, 0.f, 0.f, attr.kind_with_relu); + } + } +} + +} // namespace at::native::onednn + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/DnnlExt.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/DnnlExt.h new file mode 100644 index 0000000000000000000000000000000000000000..eb755555172f640ee919795974124a6466e55b89 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/DnnlExt.h @@ -0,0 +1,599 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include + +#include +#include + +namespace std { + +template <> +struct hash { + size_t operator()(dnnl::memory::dims const& vec) const { + size_t seed = vec.size(); + for (auto& i : vec) { + seed ^= i + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; + +} // namespace std + +using namespace dnnl; + +namespace at::native::onednn { + +class primitive_ext : public primitive { + static constexpr int max_args = 12; + + public: + primitive_ext(const primitive& base) : primitive(base) {} + primitive_ext(primitive&& base) : primitive(std::move(base)) {} + + /// Returns a memory descriptor. + /// + /// @note + /// There are also convenience methods + /// #dnnl::primitive_desc_base::src_desc(), + /// #dnnl::primitive_desc_base::dst_desc(), and others. + /// + /// @param what The kind of parameter to query; can be + /// #dnnl::query::src_md, #dnnl::query::dst_md, etc. + /// @param idx Index of the parameter. For example, convolution bias can + /// be queried with what = #dnnl::query::weights_md and idx = 1. + /// @returns The requested memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// parameter of the specified kind or index. + const_dnnl_memory_desc_t query_md(query what, int idx = 0) const { + std::vector valid_q{ + query::src_md, + query::diff_src_md, + query::weights_md, + query::diff_weights_md, + query::dst_md, + query::diff_dst_md, + query::workspace_md, + query::scratchpad_md, + query::exec_arg_md}; + if (!std::any_of(valid_q.cbegin(), valid_q.cend(), [=](query q) { + return what == q; + })) + DNNL_THROW_ERROR( + dnnl_invalid_arguments, "memory descriptor query is invalid"); + + const_dnnl_memory_desc_t cdesc = dnnl_primitive_desc_query_md( + this->get_primitive_desc(), dnnl::convert_to_c(what), idx); + + return cdesc ? cdesc : nullptr; + } + + /// Returns a source memory descriptor. + /// @param idx Source index. + /// @returns Source memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// source parameter with index @p idx. + const_dnnl_memory_desc_t src_desc(int idx) const { + return query_md(query::src_md, idx); + } + + /// Returns a destination memory descriptor. + /// @param idx Destination index. + /// @returns Destination memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// destination parameter with index @p idx. + const_dnnl_memory_desc_t dst_desc(int idx) const { + return query_md(query::dst_md, idx); + } + + /// Returns a weights memory descriptor. + /// @param idx Weights index. + /// @returns Weights memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// weights parameter with index @p idx. + const_dnnl_memory_desc_t weights_desc(int idx) const { + return query_md(query::weights_md, idx); + } + + /// Returns a diff source memory descriptor. + /// @param idx Diff source index. + /// @returns Diff source memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff source parameter with index @p idx. + const_dnnl_memory_desc_t diff_src_desc(int idx) const { + return query_md(query::diff_src_md, idx); + } + + /// Returns a diff destination memory descriptor. + /// @param idx Diff destination index. + /// @returns Diff destination memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff destination parameter with index @p idx. + const_dnnl_memory_desc_t diff_dst_desc(int idx) const { + return query_md(query::diff_dst_md, idx); + } + + /// Returns a diff weights memory descriptor. + /// @param idx Diff weights index. + /// @returns Diff weights memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff weights parameter with index @p idx. + const_dnnl_memory_desc_t diff_weights_desc(int idx) const { + return query_md(query::diff_weights_md, idx); + } + + const_dnnl_memory_desc_t exec_arg_desc(int idx) const { + return query_md(query::exec_arg_md, idx); + } + + // Separate versions without the index argument for documentation + // purposes. + + /// Returns a source memory descriptor. + /// @returns Source memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// source parameter. + const_dnnl_memory_desc_t src_desc() const { + return src_desc(0); + } + + /// Returns a destination memory descriptor. + /// @returns Destination memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// destination parameter. + const_dnnl_memory_desc_t dst_desc() const { + return dst_desc(0); + } + + /// Returns a weights memory descriptor. + /// @returns Weights memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// weights parameter. + const_dnnl_memory_desc_t weights_desc() const { + return weights_desc(0); + } + + /// Returns a diff source memory descriptor. + /// @returns Diff source memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff source memory with. + const_dnnl_memory_desc_t diff_src_desc() const { + return diff_src_desc(0); + } + + /// Returns a diff destination memory descriptor. + /// @returns Diff destination memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff destination parameter. + const_dnnl_memory_desc_t diff_dst_desc() const { + return diff_dst_desc(0); + } + + /// Returns a diff weights memory descriptor. + /// @returns Diff weights memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff weights parameter. + const_dnnl_memory_desc_t diff_weights_desc() const { + return diff_weights_desc(0); + } + + /// Returns the workspace memory descriptor. + /// @returns Workspace memory descriptor. + /// @returns A zero memory descriptor if the primitive does not require + /// workspace parameter. + const_dnnl_memory_desc_t workspace_desc() const { + return query_md(query::workspace_md, 0); + } + + /// Returns the scratchpad memory descriptor. + /// @returns scratchpad memory descriptor. + /// @returns A zero memory descriptor if the primitive does not require + /// scratchpad parameter. + /// @sa @ref dev_guide_attributes_scratchpad + const_dnnl_memory_desc_t scratchpad_desc() const { + return query_md(query::scratchpad_md, 0); + } + + inline memory make_memory( + const_dnnl_memory_desc_t md_t, + const engine& aengine, + void* handle = DNNL_MEMORY_ALLOCATE) const { + sycl_interop::memory_kind kind = dnnl::sycl_interop::memory_kind::usm; + dnnl_memory_t c_memory; + error::wrap_c_api( + dnnl_sycl_interop_memory_create( + &c_memory, md_t, aengine.get(), convert_to_c(kind), handle), + "could not create a memory"); + return memory(c_memory); + } + + memory make_src(const engine& aengine, void* handle = DNNL_MEMORY_ALLOCATE) + const { + return make_memory(src_desc(), aengine, handle); + } + + memory make_weight(const engine& aengine, void* handle = DNNL_MEMORY_ALLOCATE) + const { + return make_memory(weights_desc(), aengine, handle); + } + + memory make_bias(const engine& aengine, void* handle = DNNL_MEMORY_ALLOCATE) + const { + return make_memory(weights_desc(1), aengine, handle); + } + + memory make_dst(const engine& aengine, void* handle = DNNL_MEMORY_ALLOCATE) + const { + return make_memory(dst_desc(), aengine, handle); + } + + memory make_scratchpad( + const engine& aengine, + void* handle = DNNL_MEMORY_ALLOCATE) const { + return make_memory(scratchpad_desc(), aengine, handle); + } + + size_t get_scratchpad_size() const { + return dnnl_memory_desc_get_size(scratchpad_desc()); + } + + memory make_args(int arg_class, const engine& aengine, void* handle) const { + switch (arg_class) { + case DNNL_ARG_SRC: + return make_src(aengine, handle); + case DNNL_ARG_WEIGHTS: + return make_weight(aengine, handle); + case DNNL_ARG_SCRATCHPAD: + return make_scratchpad(aengine, handle); + case DNNL_ARG_DST: + return make_dst(aengine, handle); + case DNNL_ARG_BIAS: + return make_bias(aengine, handle); + default: + TORCH_INTERNAL_ASSERT( + false, "unsupported argument class for primitive_ext"); + } + } + + template + void set_attribute(int slot, int arg_class, void* handle, M constructor) { + if (mem_arg_cache[slot]) + mem_arg_cache[slot].set_data_handle(handle); + else { + mem_arg_cache[slot] = constructor(); + c_args[slot].arg = arg_class; + c_args[slot].memory = mem_arg_cache[slot].get(); + } + } + + sycl::event execute( + const stream& astream, + const engine& aengine, + std::vector>&& handles, + int slot_off = 2) { + auto off = slot_off; + for (const auto& p : handles) { + auto& m_arg = mem_arg_cache[off]; + if (m_arg) + m_arg.set_data_handle(p.second); + else { + m_arg = make_args(p.first, aengine, p.second); + c_args[off].arg = p.first; + c_args[off].memory = m_arg.get(); + } + ++off; + } + + sycl::event return_event; + std::vector deps{}; + error::wrap_c_api( + dnnl_sycl_interop_primitive_execute( + this->get(), astream.get(), off, c_args, &deps, &return_event), + "could not execute a primitive"); + return return_event; + } + + private: + memory mem_arg_cache[max_args]; + dnnl_exec_arg_t c_args[max_args]; +}; + +// Specifies the combined data types of input and weight tensors. +// For example, f32 means both input and weight are FP32, +// bf16_int4 means input is BF16 and weight is INT4. +enum class joint_dtypes_t { f32 = 0, f16, bf16, int8, f16_int4, bf16_int4 }; + +// Specifies the transposition state of input and weight tensors. +// Convention: first letter = input, second letter = weight. +// 'n' = not transposed, 't' = transposed. +// For example, 'nt' means input is not transposed, weight is transposed. +enum class trans_type_t { nn = 0, nt, tn, tt }; + +// Specifies the type and placement of bias in the computation. +// 'none' = no bias, +// 'scalar' = a single scalar bias applied to all elements, +// 'm' = per-row bias (typically matched to input rows), +// 'n' = per-column bias (typically matched to output channels), +// 'mn' = full bias matrix matching the output dimensions. +enum class bias_type_t { none = 0, scalar, m, n, mn }; + +template +T concat(const T& t1, at::ScalarType d) { + T t; + t.insert(t.end(), t1.begin(), t1.end()); + t.push_back((int64_t)d); + + return t; +} + +template +T concat(const T& t1, bool b) { + T t; + t.insert(t.end(), t1.begin(), t1.end()); + t.push_back(b); + + return t; +} + +template +T concat(const T& t1, int b) { + T t; + t.insert(t.end(), t1.begin(), t1.end()); + t.push_back(b); + + return t; +} + +template +T concat(const T& t1, const T& t2) { + T t; + t.insert(t.end(), t1.begin(), t1.end()); + t.insert(t.end(), t2.begin(), t2.end()); + + return t; +} + +template +T1 concat(const T1& t1, const T2& t2, const Ts&... ts) { + return concat(concat(t1, t2), ts...); +} + +template +struct onednn_types_mapper; + +template <> +struct onednn_types_mapper { + static inline std::tuple + get() { + return std::make_tuple( + dnnl::memory::data_type::f16, dnnl::memory::data_type::u4); + } +}; + +template <> +struct onednn_types_mapper { + static inline std::tuple + get() { + return std::make_tuple( + dnnl::memory::data_type::bf16, dnnl::memory::data_type::u4); + } +}; + +// TODO: bias types maybe not right +static inline dnnl::memory::dims get_bias_type( + bias_type_t b_dims, + const int m, + const int n) { + switch (b_dims) { + case bias_type_t::none: + return {0}; + case bias_type_t::scalar: + return {1, 1}; + case bias_type_t::m: + return {m, 1}; + case bias_type_t::n: + return {1, n}; + case bias_type_t::mn: + return {m, n}; + default: + TORCH_INTERNAL_ASSERT(false, "unsupported bias type ..."); + } +} + +// TODO: use template specialization on struct +template +inline void get_strides( + memory::dims& src_strides, + memory::dims& wei_strides, + memory::dims& dst_strides, + const int64_t lda, + const int64_t ldb, + const int64_t ldc) {} + +template <> +inline void get_strides( + memory::dims& src_strides, + memory::dims& wei_strides, + memory::dims& dst_strides, + const int64_t lda, + const int64_t ldb, + const int64_t ldc) { + src_strides = {lda, 1}; + wei_strides = {1, ldb}; + dst_strides = {ldc, 1}; +} + +using primitive_cache = + at::native::onednn::lru_cache; + +template +struct matmul_primitive_cache_t { + static inline primitive_ext& get( + const int m, + const int n, + const int k, + const int64_t lda, + const int64_t ldb, + const int64_t ldc, + const bias_type_t + b_dims, // for shapeless bias, not put it into template parameter + const int device_id, + F f_attr, + const int64_t scale_group_size, + const int64_t zp_group_size) { + auto& cached = get_cache(device_id); + memory::dims src_strides, wei_strides, dst_strides; + get_strides(src_strides, wei_strides, dst_strides, lda, ldb, ldc); + auto pri_key = at::native::onednn::concat( + src_strides, + wei_strides, + m, + n, + k, + int(b_dims), + int(scale_group_size), + int(zp_group_size)); + auto iter = cached.find(pri_key); + if (iter == cached.end()) { + auto [src_dt, wei_dt] = onednn_types_mapper::get(); + auto bias_dims = get_bias_type(b_dims, m, n); + + auto src_md = memory::desc({m, k}, src_dt, src_strides); + auto wei_md = memory::desc({k, n}, wei_dt, wei_strides); + auto dst_md = memory::desc({m, n}, src_dt, dst_strides); + auto bias_format = b_dims == bias_type_t::none + ? dnnl::memory::format_tag::undef + : dnnl::memory::format_tag::ab; + auto bias_md = + memory::desc(bias_dims, src_dt, bias_format); // {m, n} or {1, n} + + primitive_attr pattr; + f_attr(pattr); + + dnnl::matmul::primitive_desc matmul_pd; + auto aengine = + at::native::onednn::GpuEngineManager::Instance().get_engine( + device_id); + if (b_dims == bias_type_t::none) { + matmul_pd = dnnl::matmul::primitive_desc( + aengine, src_md, wei_md, dst_md, pattr); + } else { + matmul_pd = dnnl::matmul::primitive_desc( + aengine, src_md, wei_md, bias_md, dst_md, pattr); + } + + return cached.insert({pri_key, primitive_ext(dnnl::matmul(matmul_pd))}) + .first->second; + } else { + return iter->second; + } + } + + private: + static constexpr int max_cache_capacity = 512; + // if default constructor of primitive cache could read the environment + // variable then it'll save a lot of trouble + static inline thread_local std::array mappings; + + // this won't be needed if primitive_cache have good default constructor + static inline primitive_cache& get_cache(const int device_id) { + auto& mapping = mappings[device_id]; + if (mapping.max_size() == 0) { + mapping.resize(max_cache_capacity); + } + return mapping; + } +}; + +template +static inline primitive_ext& matmul_primitive_create_and_cache( + const trans_type_t Tt, + const bias_type_t b_dims, + const int m, + const int n, + const int k, + const int64_t lda, + const int64_t ldb, + const int64_t ldc, + const int device_id, + F attr, + const int64_t scale_group_size, + const int64_t zp_group_size) { + switch (Tt) { + case trans_type_t::nt: + return matmul_primitive_cache_t::get( + m, + n, + k, + lda, + ldb, + ldc, + b_dims, + device_id, + attr, + scale_group_size, + zp_group_size); + default: + TORCH_INTERNAL_ASSERT(false, "unsupported trans type ..."); + } +} + +template +static inline primitive_ext& matmul_primitive_create_and_cache( + const joint_dtypes_t Ts, + const trans_type_t Tt, + const bias_type_t b_dims, + const int m, + const int n, + const int k, + const int64_t lda, + const int64_t ldb, // is weight ldb necessary? + const int64_t ldc, + const int device_id, + F attr, + const int64_t scale_group_size = 0, + const int64_t zp_group_size = 0) { + switch (Ts) { + case joint_dtypes_t::f16_int4: + return matmul_primitive_create_and_cache( + Tt, + b_dims, + m, + n, + k, + lda, + ldb, + ldc, + device_id, + attr, + scale_group_size, + zp_group_size); + case joint_dtypes_t::bf16_int4: + return matmul_primitive_create_and_cache( + Tt, + b_dims, + m, + n, + k, + lda, + ldb, + ldc, + device_id, + attr, + scale_group_size, + zp_group_size); + default: + TORCH_INTERNAL_ASSERT(false, "Only support int4 ..."); + } +} + +} // namespace at::native::onednn + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/LRUCache.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/LRUCache.h new file mode 100644 index 0000000000000000000000000000000000000000..b4de22cd3fcca003c280d5d00acee4fefbec685b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/LRUCache.h @@ -0,0 +1,115 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native::onednn { + +template < + class key_t, + class value_t, + template class map_t = std::unordered_map> +class lru_cache { + public: + using value_type = std::pair; + using list_type = std::list; + using list_iter = typename list_type::iterator; + using map_type = map_t; + using const_list_iter = typename list_type::const_iterator; + using size_type = typename list_type::size_type; + + explicit lru_cache(size_type capacity) : capacity_(capacity) {} + lru_cache() : capacity_(0) {} + + [[nodiscard]] size_type size() const noexcept { + return map_.size(); + } + [[nodiscard]] size_type max_size() const noexcept { + return capacity_; + } + [[nodiscard]] bool empty() const noexcept { + return vlist_.empty(); + } + + void resize(size_type new_capacity) { + capacity_ = new_capacity; + trim(); + } + + list_iter begin() noexcept { + return vlist_.begin(); + } + const_list_iter begin() const noexcept { + return vlist_.begin(); + } + list_iter end() noexcept { + return vlist_.end(); + } + const_list_iter end() const noexcept { + return vlist_.end(); + } + + void clear() noexcept { + map_.clear(); + vlist_.clear(); + } + + void swap(lru_cache& other) noexcept { + using std::swap; + swap(vlist_, other.vlist_); + swap(map_, other.map_); + swap(capacity_, other.capacity_); + } + + list_iter find(const key_t& key) { + auto it = map_.find(key); + if (it == map_.end()) + return end(); + vlist_.splice(vlist_.begin(), vlist_, it->second); + return it->second; + } + + std::pair insert(const value_type& value) { + auto it = map_.find(value.first); + if (it != map_.end()) { + // Move existing to front + vlist_.splice(vlist_.begin(), vlist_, it->second); + return {it->second, false}; + } + + // Insert new at front + vlist_.emplace_front(value); + map_[value.first] = vlist_.begin(); + + trim(); + + return {vlist_.begin(), true}; + } + + list_iter erase(list_iter pos) { + map_.erase(pos->first); + return vlist_.erase(pos); + } + + private: + void trim() { + while (map_.size() > capacity_) { + auto last = std::prev(vlist_.end()); + map_.erase(last->first); + vlist_.pop_back(); + } + } + + list_type vlist_; + map_type map_; + size_type capacity_; +}; + +} // namespace at::native::onednn + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/Utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/Utils.h new file mode 100644 index 0000000000000000000000000000000000000000..d1378f0f11c1d77227cc12daadbfa5e1e1e61e87 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/Utils.h @@ -0,0 +1,172 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define ONEDNN_SUPPORT_DETERMINISTIC \ + (DNNL_VERSION_MAJOR >= 3 && DNNL_VERSION_MINOR >= 4) + +namespace at::native::onednn { + +dnnl::memory::format_tag get_dnnl_default_format( + int ndims, + bool is_channels_last = false, + bool allow_undef = false); + +dnnl::memory::data_type get_onednn_dtype( + const at::Tensor& tensor, + bool allow_undef = false); + +dnnl::memory::data_type get_onednn_dtype_include_double( + const at::Tensor& tensor, + bool allow_undef = false); + +bool is_supported_onednn_dtype(const at::Tensor& tensor); + +dnnl::memory::dims get_onednn_dims(const at::Tensor& tensor); + +dnnl::memory::dims get_onednn_strides(const at::Tensor& tensor); +dnnl::memory::desc get_onednn_md(const at::Tensor& tensor); + +bool onednn_strides_check(const at::Tensor& src); +bool is_broadcast(const at::Tensor& t); +void undo_broadcast_on_batch(at::Tensor& m1, at::Tensor& m2); +void undo_broadcast(at::Tensor& tensor); + +bool is_onednn_matmul_strides(const at::Tensor& tensor); + +bool is_broadcast_from_other_to_self( + const at::Tensor& self, + const at::Tensor& other); + +at::MemoryFormat get_cl_tag_by_ndim(const int64_t ndim); + +void apply_tf32_if_allowed(dnnl::primitive_attr& primitive_attr); + +bool binary_valid( + const at::Tensor& self, + const at::Tensor& other, + bool is_fusion = false); + +bool use_channels_last_for_conv( + const at::Tensor& src, + const at::Tensor& weight); + +dnnl::memory::format_tag conv_src_fmt( + const int64_t ndim, + const bool is_channels_last = false); + +dnnl::memory::dims compatible_weight_dims( + const int64_t ndim, + const int64_t groups, + const int64_t oc, + const int64_t ic, + const IntArrayRef wsizes); + +dnnl::memory::format_tag conv_weight_fmt( + const int64_t ndim, + const bool grouped = false, + const bool is_channels_last = false); + +template +dnnl::memory::dims compatible_dilation(Vec&& dilation) { + dnnl::memory::dims ret = dilation.vec(); + for (auto it = ret.begin(); it != ret.end(); it++) { + *it -= 1; + } + return ret; +} + +inline std::vector padding_r( + IntArrayRef padding, + IntArrayRef output_padding) { + // ConvTranspose padding adjustment + // + // PyTorch uses padding/output_padding: + // osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + // + output_padding + 1 + // + // MKLDNN uses padding_l/padding_r: + // osize = (isize - 1) * stride - padding_l - padding_r + dilation * + // (kernel_size - 1) + 1 + // + // So: padding_l = padding, padding_r = padding - output_padding + // + auto dim = padding.size(); + std::vector pad_r(dim); + for (const auto d : c10::irange(dim)) { + pad_r[d] = padding[d] - output_padding[d]; + } + return pad_r; +} + +template +dnnl::memory dnnl_memory_from_host_scalar( + T host_value, + Tensor& holder, + dnnl::engine& engine) { + auto options = at::TensorOptions() + .dtype(c10::CppTypeToScalarType::value) + .device(kXPU); + holder = at::empty({1}, options).fill_(host_value); + dnnl::memory::desc md = get_onednn_md(holder); + dnnl::memory mem = make_onednn_memory(md, engine, holder.data_ptr()); + return mem; +} + +struct PartitionCache { + std::unordered_map, dnnl::graph::partition> partition_map_{}; + + // The first 8 bits are reserved + // bit 0: is int8 + // bit 1: is uint8 + // bit 2: fp16(0) / bf16(1) + // bit 3: is fp32 + // bit 4: is sdpa pattern + // bit 5: is sdpa backward pattern + // bit 6-7: reserved for future use + // The rest of the bits depend upon the arguments provided + // However, down the line, we might have different bitsets for different + // patterns + enum class BitType : uint8_t { + Int8 = 0, + Uint8 = 1, + Bfloat16 = 2, + Float32 = 3, + SdpaPattern = 4, + SdpaBwdPattern = 5 + }; + + dnnl::graph::partition& insert_partition_cache( + std::bitset<32>& patternID, + dnnl::graph::partition& p) { + partition_map_[patternID] = std::move(p); + return partition_map_[patternID]; + } + std::optional> find_partition( + std::bitset<32>& patternID) { + auto iter = partition_map_.find(patternID); + if (iter != partition_map_.end()) { + return iter->second; + } + return std::nullopt; + } +}; + +} // namespace at::native::onednn + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNN.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNN.h new file mode 100644 index 0000000000000000000000000000000000000000..ba8983ec0b36d54ca060503b4dade1a73db0a395 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -0,0 +1,225 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace at::native::onednn { + +TORCH_API sycl::event matmul( + at::Tensor& result, + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& b_raw, + bool m2_trans, + Attr attr, + const std::vector& deps = {}); + +TORCH_API sycl::event convolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps = {}); + +TORCH_API sycl::event convolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef diff_weight_aten_size, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps = {}); + +TORCH_API sycl::event convolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dst_padding, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dst_padding, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dst_padding, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps = {}); + +TORCH_API void woq_matmul_int4( + at::Tensor& result, // dst, [M, N] + const at::Tensor& mat1_, // src, [M, K] + const at::Tensor& mat2_, // quantized weight, [K/8, N] + const at::Tensor& scale, // [K/group_size, N] + const at::Tensor& zp, // [k/group_size, N] + int64_t group_size, + bool pri_cache = true); + +dnnl::memory::dims conv_dst_size( + int64_t ndim, + IntArrayRef src_tz, + IntArrayRef wgh_tz, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation); + +dnnl::memory::dims deconv_dst_size( + IntArrayRef src_size, + IntArrayRef wgh_size, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + IntArrayRef dst_padding, + int64_t groups); + +at::Tensor quantized_convolution( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + bool transposed, + int64_t groups, + at::Tensor output, + double inv_output_scale, + int64_t output_zero_point, + std::optional accum, + double accum_scale, + int64_t accum_zero_point, + std::optional output_dtype, + std::optional binary_attr, + std::optional binary_alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + +void quantized_matmul( + at::Tensor mat1, // act + double input_scale, + int64_t input_zero_point, + at::Tensor mat2, // weight + at::Tensor& weight_scales, + at::Tensor& weight_zero_points, + at::Tensor& b_raw, + at::Tensor result, // output + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::optional other, // extra input for binary-post-op + double other_scale, + int64_t other_zero_point, + const std::string_view& binary_post_op, + double binary_alpha, + const std::string_view& unary_post_op, + torch::List>& unary_post_op_args, + std::string_view unary_post_op_algorithm, + bool m2_trnas); + +void sdpa( + int batch_size, + int seq_len_q, + int seq_len_kv, + int num_head_q, + int num_head_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& query, + const Tensor& key, + const Tensor& value, + std::optional attn_mask, + bool is_causal, + float softmax_scale, + const Tensor& attention, + bool compute_logsumexp, + const Tensor& logsumexp); + +void sdpa_backward( + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + std::optional attn_mask, + bool is_causal, + double scale, + Tensor& grad_query, + Tensor& grad_key, + Tensor& grad_value); + +sycl::event scaled_matmul( + const Tensor& mat1, + const Tensor& mat2, + Tensor& result, + const Tensor& scale_a, + const Tensor& scale_b, + at::blas::ScalingType scaling_choice_a, + at::blas::ScalingType scaling_choice_b, + const std::optional& bias, + const std::optional& scale_result, + bool use_fast_accum); +} // namespace at::native::onednn + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNNContext.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNNContext.h new file mode 100644 index 0000000000000000000000000000000000000000..9c195f19089bd945dcc66eb4f60a0aa3a5e3acfc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNNContext.h @@ -0,0 +1,95 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace at::native::onednn { + +TORCH_XPU_API dnnl::memory make_onednn_memory( + dnnl::memory::desc md, + dnnl::engine& engine, + void* ptr); + +// Keep non-static and non-inline +bool set_onednn_verbose(int level); + +// GpuEngineManager singleton +struct TORCH_XPU_API GpuEngineManager { + static GpuEngineManager& Instance(); // Singleton + + dnnl::engine& get_engine( + DeviceIndex device_index = c10::xpu::current_device()) { + c10::xpu::check_device_index(device_index); + return *engine_pool[device_index]; + } + + dnnl::engine& get_engine(const Device& device) { + TORCH_INTERNAL_ASSERT(device.type() == kXPU); + return get_engine(device.index()); + } + + GpuEngineManager(GpuEngineManager const&) = delete; + GpuEngineManager& operator=(GpuEngineManager const&) = delete; + GpuEngineManager(GpuEngineManager&&) = default; + GpuEngineManager& operator=(GpuEngineManager&&) = default; + + protected: + GpuEngineManager(); + ~GpuEngineManager() = default; + + private: + std::vector> engine_pool; +}; + +// GpuStreamManager singleton +struct TORCH_XPU_API GpuStreamManager { + static GpuStreamManager& Instance(); // Singleton + + dnnl::stream& get_stream( + DeviceIndex device_index = c10::xpu::current_device()) { + auto stream = c10::xpu::getCurrentXPUStream(device_index); + auto priority = stream.priority(); + if (stream_pool[device_index][priority].find(stream) == + stream_pool[device_index][priority].end()) { + stream_pool[device_index][priority][stream] = + std::make_shared(dnnl::sycl_interop::make_stream( + GpuEngineManager::Instance().get_engine(device_index), + stream.queue())); + } + return *stream_pool[device_index][priority][stream]; + } + + GpuStreamManager(GpuStreamManager const&) = delete; + GpuStreamManager& operator=(GpuStreamManager const&) = delete; + GpuStreamManager(GpuStreamManager&&) = default; + GpuStreamManager& operator=(GpuStreamManager&&) = default; + + protected: + GpuStreamManager() { + c10::DeviceIndex device_count = c10::xpu::device_count_ensure_non_zero(); + stream_pool.resize(device_count); + } + ~GpuStreamManager() = default; + + private: + using stream_hash_map = + ska::flat_hash_map>; + std::vector< + std::array> + stream_pool; +}; + +} // namespace at::native::onednn + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qconv.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qconv.h new file mode 100644 index 0000000000000000000000000000000000000000..64f87b3bf8435db713d71dc72c2ff2a15d75e99c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qconv.h @@ -0,0 +1,115 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native::xpu { +class QConvoneDNNXPU final { + public: + C10_API static at::Tensor run_pointwise( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_binary( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + C10_API static at::Tensor run_pointwise_binary_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + static inline c10::ScalarType qconv_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + + static at::Tensor qconv_prepack_xpu( + at::Tensor weight, + at::Tensor weight_scales, + double input_scale, + int64_t input_zero_point, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + std::optional> input_shape); +}; + +} // namespace at::native::xpu +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qlinear.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qlinear.h new file mode 100644 index 0000000000000000000000000000000000000000..fd591eb16a16481c7f58b101ba675e74e720e4b0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qlinear.h @@ -0,0 +1,96 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native::xpu { + +class QLinearOnednnXPU final { + public: + C10_API static Tensor q_linear_pointwise( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_prepack_onednn( + at::Tensor weight, + std::optional> input_shape); + + static inline c10::ScalarType qlinear_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + +}; // class QLinearOnednnXPU + +} // namespace at::native::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/Copy.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..69caaa34e35c541dd0c1e8919817947e3adad399 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/Copy.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once +#include + +namespace at::native::mps { + +at::Tensor& mps_copy_( + at::Tensor& dst, + const at::Tensor& src, + bool non_blocking); +void copy_blit_mps(void* dst, const void* src, size_t size); + +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/MPSGraphSequoiaOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/MPSGraphSequoiaOps.h new file mode 100644 index 0000000000000000000000000000000000000000..327d5cab3dd7e6a1c3f85ad97c1b2c4284d6fb00 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/MPSGraphSequoiaOps.h @@ -0,0 +1,46 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#if !defined(__MAC_15_0) && (!defined(MAC_OS_X_VERSION_15_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_15_0)) + +@interface MPSNDArrayIdentity : MPSNDArrayUnaryKernel +- (MPSNDArray* __nullable)reshapeWithCommandBuffer:(__nullable id)cmdBuf + sourceArray:(MPSNDArray* __nonnull)sourceArray + shape:(MPSShape* __nonnull)shape + destinationArray:(MPSNDArray* __nullable)destinationArray; +@end + +@interface MPSNDArrayDescriptor () +@property(readwrite, nonatomic) BOOL preferPackedRows; +@end + +@interface MPSNDArray () +- (nonnull instancetype)initWithBuffer:(id _Nonnull)buffer + offset:(NSUInteger)offset + descriptor:(MPSNDArrayDescriptor* _Nonnull)descriptor; +- (MPSNDArray* __nullable)arrayViewWithShape:(MPSShape* _Nullable)shape strides:(MPSShape* _Nonnull)strides; +@end + +typedef NS_ENUM(NSInteger, MTLMathMode) { + MTLMathModeSafe = 0, + MTLMathModeRelaxed = 1, + MTLMathModeFast = 2, +}; + +typedef NS_ENUM(NSInteger, MTLMathFloatingPointFunctions) { + MTLMathFloatingPointFunctionsFast = 0, + MTLMathFloatingPointFunctionsPrecise = 1, +}; + +@interface MTLCompileOptions () +@property(readwrite, nonatomic) MTLMathMode mathMode; +@property(readwrite, nonatomic) MTLMathFloatingPointFunctions mathFloatingPointFunctions; +@end + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/MetalShaderLibrary.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/MetalShaderLibrary.h new file mode 100644 index 0000000000000000000000000000000000000000..7e7fbc09d74f99130919dd3b5c5467b9006ead14 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/MetalShaderLibrary.h @@ -0,0 +1,202 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#ifdef __OBJC__ +#include +typedef id MTLLibrary_t; +typedef id MTLFunction_t; +typedef id MTLComputePipelineState_t; +typedef id MTLComputeCommandEncoder_t; +#else +typedef void MTLCompileOptions; +typedef void* MTLLibrary_t; +typedef void* MTLFunction_t; +typedef void* MTLComputePipelineState_t; +typedef void* MTLComputeCommandEncoder_t; +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +// Forward declaration of TensorBase and TensorIteratorBase +namespace at { +class TensorBase; +struct TensorIteratorBase; +} // namespace at + +namespace at::native::mps { + +namespace detail { +template +class has_size_type { + template + static constexpr std::true_type check(typename U::size_type*); + template + static constexpr std::false_type check(...); + + public: + static constexpr bool value = decltype(check(nullptr))::value; +}; + +template +constexpr bool has_size_type_v = has_size_type::value; + +} // namespace detail + +// Returns `gpuAddress` of respective `id` plus storage offset +void* get_tensor_gpu_address(const at::TensorBase&); + +class MetalKernelFunction { + public: + MetalKernelFunction(MTLComputePipelineState_t cps_, MTLFunction_t f_); + ~MetalKernelFunction(); + MetalKernelFunction(MetalKernelFunction&) = delete; + // Shader properties + uint64_t getMaxThreadsPerThreadgroup() const; + uint64_t getThreadExecutionWidth() const; + uint64_t getStaticThreadGroupMemoryLength() const; + void runCommandBlock(std::function f); + // Methods below should be called from runCommandBlock function + void startEncoding(); + void setArg(unsigned idx, const at::TensorBase& t); + void setArg(unsigned idx, const void* ptr, uint64_t size); + template < + typename T, + typename = std::enable_if_t< + std::is_integral_v || std::is_same_v || + (std::is_class_v && std::is_trivially_copyable_v && + !detail::has_size_type_v)>> + inline void setArg(unsigned idx, const T val) { + setArg(idx, &val, sizeof(T)); + } + + template < + typename Container, + typename = std::enable_if_t>> + inline void setArg(unsigned idx, const Container& values) { + setArg( + idx, + values.data(), + values.size() * sizeof(typename Container::value_type)); + } + void dispatch( + uint64_t length, + std::optional groupSize = std::nullopt); + void dispatch( + c10::ArrayRef length, + c10::OptionalArrayRef groupSize = std::nullopt); + + private: + MTLComputePipelineState_t cps; + MTLFunction_t func; + MTLComputeCommandEncoder_t encoder = nullptr; +}; + +class MetalShaderLibrary { + public: + MetalShaderLibrary(std::string src) + : shaderSource(std::move(src)), nparams(0), compile_options(nullptr) {} + MetalShaderLibrary(std::string src, unsigned nparams_) + : shaderSource(std::move(src)), + nparams(nparams_), + compile_options(nullptr) {} + MetalShaderLibrary( + std::string src, + unsigned nparams_, + MTLCompileOptions* compile_options_) + : shaderSource(std::move(src)), + nparams(nparams_), + compile_options(compile_options_) {} + MetalShaderLibrary(const MetalShaderLibrary&) = delete; + virtual ~MetalShaderLibrary(); + std::vector getFunctionNames(); + std::shared_ptr getKernelFunction( + const std::string& name); + // Returns a raw pointer to the kernel function for use in C APIs + MetalKernelFunction* getCachedKernelFunctionPtr(const std::string& name); + inline MTLComputePipelineState_t getPipelineStateForFunc( + const std::string& fname) { + return getLibraryPipelineState(getLibrary(), fname).first; + } + MTLComputePipelineState_t getPipelineStateForFunc( + const std::string& fname, + const std::initializer_list& params) { + return getLibraryPipelineState(getLibrary(params), fname).first; + } + inline MTLFunction_t getMTLFunction(const std::string& fname) { + return getLibraryPipelineState(getLibrary(), fname).second; + } + MTLFunction_t getMTLFunction( + const std::string& fname, + const std::initializer_list& params) { + return getLibraryPipelineState(getLibrary(params), fname).second; + } + static MetalShaderLibrary& getBundledLibrary(); + void exec_unary_kernel( + TensorIteratorBase& iter, + const std::string& name, + const std::optional alpha = std::nullopt, + const std::optional scalar_arg_type = std::nullopt); + void exec_binary_kernel( + TensorIteratorBase& iter, + const std::string& name, + const std::optional alpha = std::nullopt, + const std::optional scalar_arg_type = std::nullopt); + void exec_ternary_kernel(TensorIteratorBase& iter, const std::string& name); + + template + void exec_unary_kernel_with_params( + TensorIteratorBase& iter, + const std::string& name, + T params, + const std::string& params_type_name); + template + void exec_binary_kernel_with_params( + TensorIteratorBase& iter, + const std::string& name, + T params, + const std::string& params_type_name); + + protected: + virtual MTLLibrary_t getLibrary(); + virtual MTLLibrary_t getLibrary( + const std::initializer_list& params); + MTLLibrary_t library = nullptr; + + private: + std::pair getLibraryPipelineState( + MTLLibrary_t lib, + const std::string& fname); + MTLLibrary_t compileLibrary(const std::string& src); + std::string shaderSource; + unsigned nparams; + MTLCompileOptions* compile_options; + std::unordered_map libMap; + std::unordered_map< + std::string, + std::pair> + cplMap; + // Cache for kernel functions returned by getCachedKernelFunctionPtr + std::unordered_map> + kernelCache; +}; + +class DynamicMetalShaderLibrary : public MetalShaderLibrary { + public: + DynamicMetalShaderLibrary(const std::string& src) : MetalShaderLibrary(src) { + // Compile right away + getLibrary(); + } + ~DynamicMetalShaderLibrary() override; +}; + +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/OperationUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/OperationUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..afb6f3a2737069a80c943685ea1a34d9b2a0bd02 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/OperationUtils.h @@ -0,0 +1,801 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +#include + +@interface MPSGraph (PyTorchFixups) +- (MPSGraphTensor*)minimumWithNaNPropagationAndIntFallbackWithPrimaryTensor:(MPSGraphTensor*)primaryTensor + secondaryTensor:(MPSGraphTensor*)secondaryTensor + name:(NSString*)name; + +- (MPSGraphTensor*)maximumWithNaNPropagationAndIntFallbackWithPrimaryTensor:(MPSGraphTensor*)primaryTensor + secondaryTensor:(MPSGraphTensor*)secondaryTensor + name:(NSString*)name; +@end + +using namespace at::mps; + +namespace at::native::mps { + +struct MPSScalar { + id getMTLBuffer() const { + return __builtin_bit_cast(id, buffer.get()); + } + + size_t size = 0; + ScalarType type = ScalarType::Undefined; + c10::DataPtr buffer; // stores MTLBuffer (frees buffer if MPSScalar instance goes out of scope) + union { + float f; // MPS doesn't support 'double' + at::Half h; + int64_t i; + bool b; + c10::complex cf; + c10::complex ch; + at::BFloat16 bf16; + } value{}; +}; + +void runMPSGraph(MPSStream* mpsStream, MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results); + +MPSDataType getMPSDataType(ScalarType scalar_type); +static inline MPSDataType getMPSDataType(const TensorBase& t) { + return getMPSDataType(t.scalar_type()); +} +MPSDataType getMPSScalarType(ScalarType scalar_type); +static inline MPSDataType getMPSScalarType(const TensorBase& t) { + return getMPSScalarType(t.scalar_type()); +} +MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type); +std::string getMPSTypeString(ScalarType scalar_type, bool short_name = false); +static inline std::string getMPSTypeString(const TensorBase& t, bool short_name = false) { + return getMPSTypeString(t.scalar_type(), short_name); +} +std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type); +static inline std::string scalarToMetalTypeString(const TensorBase& t) { + return scalarToMetalTypeString(t.scalar_type()); +} +NSArray* getTensorAxes(const TensorBase& t); +NSArray* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim); +std::string getMPSShapeString(MPSShape* shape); +std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false); +std::string to_hex_key(float); +std::string getArrayRefString(const IntArrayRef s); +// use has_storage() on the returned tensor to determine if src actually is a view +Tensor gatherViewTensor(const Tensor& src, Tensor& dst); +Tensor& scatterViewTensor(const Tensor& src, Tensor& output); +MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input); +MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input); + +MPSNDArray* getStridedMPSNDArray(const TensorBase& src, MPSNDArray* srcNDArray); +MPSNDArray* getMPSNDArray(const TensorBase& t, const IntArrayRef& sizes = {}, const IntArrayRef& strides = {}); +MPSNDArray* getMPSNDArray(const TensorBase& t, MPSShape* sizes = nil, MPSShape* strides = nil); +// The MPSShape could vary based on memory format +Tensor getTensorView(const Tensor& t, MPSShape* shape); +MPSShape* getMPSShape(const TensorBase& t, c10::MemoryFormat memory_format = MemoryFormat::Contiguous); +MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format = MemoryFormat::Contiguous); + +// Determines whether a tensor is too large to use MPSGraph +bool isTooLargeForMPSGraph(const Tensor& tensor, bool useMPSStridedAPI = true); + +static inline id getMTLBufferStorage(const TensorBase& tensor) { + return __builtin_bit_cast(id, tensor.storage().data()); +} + +class Placeholder { + public: + Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {} + Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr), _tensor(Tensor()) {} + Placeholder(MPSGraphTensor* mpsGraphTensor, MPSNDArray* mpsNDArray); + Placeholder(MPSGraphTensor* mpsGraphTensor, + const Tensor& self, + MPSShape* mpsShape = nullptr, + bool gatherTensorData = true, + MPSDataType dataType = MPSDataTypeInvalid, + bool useMPSStridedAPI = true); + MPSGraphTensor* getMPSGraphTensor() { + return _placeholder; + } + MPSGraphTensorData* getMPSGraphTensorData() { + return _value; + } + bool isIntermediate() { + return _value == nullptr; + } + + private: + MPSGraphTensor* _placeholder; + MPSGraphTensorData* _value; + Tensor _tensor; +}; + +void resize_tensor(Tensor* output); +Tensor wrapped_scalar_tensor_mps(const Scalar& scalar, const Device device); +MPSGraphTensor* convertNHWCtoNCHW(MPSGraph* mpsGraph, MPSGraphTensor* tensor); +MPSGraphTensor* castMPSTensor(MPSGraph* mpsGraph, MPSGraphTensor* tensor, ScalarType toType); +MPSGraphTensor* castMPSTensor(MPSGraph* mpsGraph, MPSGraphTensor* tensor, MPSDataType toType); +MPSGraphTensorData* getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStream, const TensorBase& tensor); +MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar); + +MPSGraph* make_mps_graph(); + +MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph* mpsGraph, MPSDataType dataType); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph* mpsGraph, MPSDataType dataType, MPSShape* mpsShape); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph* mpsGraph, const TensorBase& tensor); +MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph* mpsGraph, MPSDataType dataType); +MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph* mpsGraph, const Scalar& scalar); + +std::string get_mem_format_string(c10::MemoryFormat memory_format); + +using MPSCacheKey = uint64_t; + +struct MPSCachedKernel { + MPSCachedKernel(NSObject* object) : _object([object retain]) {} + virtual ~MPSCachedKernel() { + [_object release]; + _object = nullptr; + } + + // Delete copy constructor and assignment + MPSCachedKernel(const MPSCachedKernel&) = delete; + void operator=(const MPSCachedKernel&) = delete; + + template + inline T* kernel() const { + return (T*)_object; + } + + private: + NSObject* _object = nullptr; +}; + +// derive this class to cache a graph and its inputs/outputs +// can be used to store any NSObject +struct MPSCachedGraph { + MPSCachedGraph(NSObject* object) : _object([object retain]) {} + virtual ~MPSCachedGraph() { + [_object release]; + _object = nullptr; + } + + template + inline T* as() { + return static_cast(this); + } + + MPSGraph* graph() const { + return (MPSGraph*)_object; + } + NSObject* object() const { + return _object; + } + + private: + NSObject* _object = nullptr; +}; + +struct MPSUnaryCachedGraph : public MPSCachedGraph { + MPSUnaryCachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; +}; + +struct MPSUnaryGradCachedGraph : public MPSCachedGraph { + MPSUnaryGradCachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; // some backward input is actually the forward's output + MPSGraphTensor* gradInputTensor_ = nil; +}; + +struct MPSBinaryCachedGraph : public MPSCachedGraph { + MPSBinaryCachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* otherTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; +}; + +struct MPSBinaryGradCachedGraph : public MPSCachedGraph { + MPSBinaryGradCachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* otherTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; +}; + +struct MPSKernelCache { + typedef MPSCachedKernel* (^CreateCachedKernelBlock)(); + + struct CacheEntry { + CacheEntry(const std::string& key, MPSCachedKernel* cachedKernel) : cachedKernel_(cachedKernel), key_(key) {} + MPSCachedKernel* cachedKernel_ = nullptr; + std::string key_; + }; + + public: + static MPSKernelCache* getInstance() { + if (_instance_cache == nullptr) { + _instance_cache = new MPSKernelCache(); + } + return _instance_cache; + } + + ~MPSKernelCache() { + dispatch_release(serialQueue_); + for (const auto& i : cache_) { + delete i.second.cachedKernel_; + } + } + + // Disallow the copy constructor and operator= functions + MPSKernelCache(const MPSKernelCache&) = delete; + void operator=(const MPSKernelCache&) = delete; + + MPSCachedKernel* CreateCachedKernel(const std::string& key, CreateCachedKernelBlock createCacheBlock) { + __block MPSCachedKernel* cachedKernel = nil; + MPSCacheKey hash = std::hash{}(key); + dispatch_sync_with_rethrow(serialQueue_, ^() { + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached kernel!\n"); + cachedKernel = entry.cachedKernel_; + } else { + cachedKernel = createCacheBlock(); + CacheEntry entry(key, cachedKernel); + cache_.emplace(hash, entry); + } + }); + return cachedKernel; + } + template + inline T* CreateCachedKernelAs(const std::string& key, CreateCachedKernelBlock createCacheBlock) { + return static_cast(CreateCachedKernel(key, createCacheBlock)); + } + + MPSCachedKernel* LookUp(const std::string& key) const { + __block MPSCachedKernel* cachedKernel = nil; + + MPSCacheKey hash = std::hash{}(key); + dispatch_sync_with_rethrow(serialQueue_, ^() { + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached kernel!\n"); + cachedKernel = entry.cachedKernel_; + } + }); + return cachedKernel; + } + + template + inline T* LookUpAs(const std::string& key) const { + return static_cast(LookUp(key)); + } + + private: + MPSKernelCache() { + serialQueue_ = dispatch_queue_create("kernel cache queue", DISPATCH_QUEUE_SERIAL); + } + + static MPSKernelCache* _instance_cache; + std::unordered_map cache_; + dispatch_queue_t serialQueue_ = nullptr; +}; + +// Common template for creating cached kernel if missing +template +inline T* LookUpOrCreateCachedKernel(const std::string& key, std::function instantiate) { + auto cache_ = MPSKernelCache::getInstance(); + if (auto rc = cache_->LookUpAs(key)) { + return rc; + } + return cache_->CreateCachedKernelAs(key, ^mps::MPSCachedKernel*() { + auto k_ = new mps::MPSCachedKernel(instantiate()); + return k_; + }); +} + +// TODO: Improve the overall design of MPSGraphCache. +// https://github.com/pytorch/pytorch/issues/77176 +// Cache holding various keys mapped to graphs +struct MPSGraphCache { + typedef MPSCachedGraph* (^CreateCachedGraphBlock)(); + + struct CacheEntry { + CacheEntry(const std::string& key, MPSCachedGraph* cachedGraph) : cachedGraph_(cachedGraph), key_(key) {} + MPSCachedGraph* cachedGraph_ = nullptr; + std::string key_; + }; + + public: + static MPSGraphCache* getInstance() { + if (_instance_cache == nullptr) { + _instance_cache = new MPSGraphCache(); + } + return _instance_cache; + } + + ~MPSGraphCache() { + dispatch_release(serialQueue_); + + for (const auto& i : cache_) { + delete i.second.cachedGraph_; + } + } + + // Disallow the copy constructor and operator= functions + MPSGraphCache(const MPSGraphCache&) = delete; + void operator=(const MPSGraphCache&) = delete; + + MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock) { + __block MPSCachedGraph* cachedGraph = nil; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync_with_rethrow(serialQueue_, ^() { + // verify the cached entry doesn't already exist + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + cachedGraph = entry.cachedGraph_; + } else { + cachedGraph = createCacheBlock(); + CacheEntry entry(key, cachedGraph); + cache_.emplace(hash, entry); + profileCachedGraph(entry); + } + }); + return cachedGraph; + } + + template + inline T* CreateCachedGraphAs(const std::string& key, CreateCachedGraphBlock createCacheBlock) { + return static_cast(CreateCachedGraph(key, createCacheBlock)); + } + + MPSCachedGraph* LookUp(const std::string& key) const { + __block MPSCachedGraph* cachedGraph = nullptr; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync(serialQueue_, ^() { + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + cachedGraph = entry.cachedGraph_; + profileCachedGraph(entry); + } + }); + return cachedGraph; + } + + template + inline T* LookUpAs(const std::string& key) const { + return static_cast(LookUp(key)); + } + + private: + MPSGraphCache() { + serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL); + } + // this is defined in OperationUtils.mm to not include + // MPSProfiler.h in header OperationUtils.h + void profileCachedGraph(const CacheEntry& cacheEntry) const; + + static MPSGraphCache* _instance_cache; + std::unordered_map cache_; + dispatch_queue_t serialQueue_ = nullptr; +}; + +// Common template for creating graph with a specified cache if missing +template +inline T* LookUpOrCreateCachedGraph(const std::string& key, std::function instantiate) { + auto cache_ = MPSGraphCache::getInstance(); + if (auto rc = cache_->LookUpAs(key)) { + return rc; + } + return cache_->CreateCachedGraphAs(key, ^mps::MPSCachedGraph*() { + T* newCachedGraph = nil; + @autoreleasepool { + // Initialize graph + auto mpsGraph = mps::make_mps_graph(); + newCachedGraph = new T(mpsGraph); + instantiate(mpsGraph, newCachedGraph); + } + return newCachedGraph; + }); +} + +// Common math operations +MPSGraphTensor* log1p(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor); + +/** + * Returns distance from lowest to highest element offset in given tensor. + */ +size_t compute_storage_numel_distance(const TensorBase& t); + +/** + * Checks whether tensor is mapped to a contiguous area in the storage. + */ +inline bool is_dense_in_storage(const TensorBase& t) { + return compute_storage_numel_distance(t) == static_cast(t.numel()); +} + +template , encoder_t> || + std::is_same_v, encoder_t>>> +static inline void mtl_setBuffer(encoder_t encoder, const TensorBase& t, unsigned idx) { + if (C10_UNLIKELY(t.device().type() == kCPU)) { + if constexpr (std::is_same_v, encoder_t>) { + TORCH_CHECK(t.dim() == 0, "Passed CPU tensor to MPS op"); + // MPS does not support doubles, silently downcast CPU scalar to float + if (C10_UNLIKELY(t.scalar_type() == kDouble)) { + auto val = static_cast(*reinterpret_cast(t.const_data_ptr())); + [encoder setBytes:&val length:sizeof(val) atIndex:idx]; + return; + } + if (C10_UNLIKELY(t.scalar_type() == kComplexDouble)) { + auto val = static_cast>(*reinterpret_cast*>(t.const_data_ptr())); + [encoder setBytes:&val length:sizeof(val) atIndex:idx]; + return; + } + [encoder setBytes:t.storage().data() length:t.element_size() atIndex:idx]; + } else { + TORCH_CHECK(false, "Passed CPU tensor to MPS op"); + } + return; + } + [encoder setBuffer:getMTLBufferStorage(t) offset:t.storage_offset() * t.element_size() atIndex:idx]; +} + +// Implementation of setBytes for containers vs trivially copiable types must be separate +// Containers like `std::array` could have been uploaded directly, but `c10::ArrayRef`, +// while trivially copiable, includes padding which if copied as Metal shader parameters +// might overwrite other values +template < + typename T, + typename = std::enable_if_t || std::is_same_v || + (std::is_class_v && std::is_trivially_copyable_v && !detail::has_size_type_v)>> +static inline void mtl_setBytes(id encoder, const T val, unsigned idx) { + [encoder setBytes:&val length:sizeof(T) atIndex:idx]; +} + +template >> +static inline void mtl_setBytes(id encoder, const Container& values, unsigned idx) { + [encoder setBytes:values.data() length:sizeof(typename Container::value_type) * values.size() atIndex:idx]; +} + +static inline void mtl_setBytes(id encoder, const MPSScalar& s, unsigned idx) { + [encoder setBytes:&s.value length:s.size atIndex:idx]; +} + +static size_t iter_tensor_offset(TensorIteratorBase& iter, unsigned idx) { + // At the moment, MPS storage data is not the real GPU pointer, but rather a pointer to id object + // But TensorIterator constructs data_ptr as if base was just a raw pointer + // Workaround this problem by computing an offset from the start of the tensor, which works for both + // tensor views and sliced 64-bit iterators + return reinterpret_cast(iter.data_ptr(idx)) - + reinterpret_cast(iter.tensor_base(idx).storage().data()); +} + +static inline void bind_iter_tensors(id encoder, + TensorIteratorBase& iter, + std::optional ntensors = std::nullopt) { + for (auto idx : c10::irange(ntensors.value_or(iter.ntensors()))) { + auto& t = iter.tensor_base(idx); + // Handle CPU scalars + if (C10_UNLIKELY(t.device().type() == kCPU)) { + mtl_setBuffer(encoder, t, idx); + continue; + } + auto offs = iter_tensor_offset(iter, idx); + [encoder setBuffer:getMTLBufferStorage(t) offset:offs atIndex:idx]; + } +} + +namespace detail { +template +inline void mtl_setArg(id encoder, const T& val, unsigned idx) { + mtl_setBytes(encoder, val, idx); +} + +inline void mtl_setArg(id encoder, id val, unsigned idx) { + [encoder setBuffer:val offset:0 atIndex:idx]; +} + +template <> +inline void mtl_setArg(id encoder, const Tensor& val, unsigned idx) { + mtl_setBuffer(encoder, val, idx); +} + +template <> +inline void mtl_setArg(id encoder, const std::optional& val, unsigned idx) { + if (val.has_value()) { + mtl_setBuffer(encoder, val.value(), idx); + } +} + +template <> +inline void mtl_setArg(id encoder, const TensorBase& val, unsigned idx) { + mtl_setBuffer(encoder, val, idx); +} +// MPS does not support doubles, so cast it down to float before passing as an argument +template <> +inline void mtl_setArg(id encoder, const double& val, unsigned idx) { + float val_f = static_cast(val); + mtl_setBytes(encoder, val_f, idx); +} +} // namespace detail + +template +static inline void mtl_setArgs(id encoder, const T& val) { + detail::mtl_setArg(encoder, val, idx); +} + +template +static inline void mtl_setArgs(id encoder, const T& val, Args&&... args) { + detail::mtl_setArg(encoder, val, idx); + mtl_setArgs(encoder, std::forward(args)...); +} + +static inline void mtl_dispatch1DJob(id encoder, + id cplState, + NSUInteger length) { + static_assert(sizeof(NSUInteger) == sizeof(uint64_t)); + const auto maxThreadsPerGroup = [cplState maxTotalThreadsPerThreadgroup]; + auto size = MTLSizeMake(length, 1, 1); + auto threadGroupSize = MTLSizeMake(std::min(maxThreadsPerGroup, length), 1, 1); + [encoder dispatchThreads:size threadsPerThreadgroup:threadGroupSize]; +} + +id generateKernelDataOffsets(id commandEncoder, + const TensorIteratorBase& iter, + bool use_64bit_index = false); + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1) { + return @{p1.getMPSGraphTensor() : p1.getMPSGraphTensorData()}; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2) { + return @{ + p1.getMPSGraphTensor() : p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor() : p2.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3) { + return @{ + p1.getMPSGraphTensor() : p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor() : p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor() : p3.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3, Placeholder& p4) { + return @{ + p1.getMPSGraphTensor() : p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor() : p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor() : p3.getMPSGraphTensorData(), + p4.getMPSGraphTensor() : p4.getMPSGraphTensorData(), + }; +} + +inline void runMPSGraph(MPSStream* stream, MPSGraph* graph, NSDictionary* feeds, Placeholder& result) { + runMPSGraph(stream, graph, feeds, dictionaryFromPlaceholders(result)); +} + +// MPS yet to support double types, but starting from MacOS 14, supports bfloat16 +inline bool supportedFloatingType(ScalarType dtype) { + return dtype == kFloat || dtype == kHalf || dtype == kBFloat16; +} + +inline bool supportedFloatingType(const TensorBase& t) { + return supportedFloatingType(t.scalar_type()); +} + +inline bool supportedFloatingOrComplexType(ScalarType dtype) { + if (dtype == kComplexFloat || dtype == kComplexHalf) { + return true; + } + return supportedFloatingType(dtype); +} +inline bool supportedFloatingOrComplexType(const TensorBase& t) { + return supportedFloatingOrComplexType(t.scalar_type()); +} + +inline bool needsGather(const TensorBase& t) { + static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS); + return !is_macOS_15_0_or_newer && (!t.is_contiguous() || t.storage_offset()); +} + +template +void MetalShaderLibrary::exec_unary_kernel_with_params(TensorIteratorBase& iter, + const std::string& name, + T params, + const std::string& params_type_name) { + using namespace at::mps; + // Decompose 64-bit tensor into 32-bit ones + if (!iter.can_use_32bit_indexing()) { + for (auto&& sub_iter : iter.with_32bit_indexing()) { + exec_unary_kernel_with_params(sub_iter, name, params, params_type_name); + } + return; + } + + auto inputTensor = iter.input(0); + auto outputTensor = iter.output(0); + uint32_t length = iter.numel(); + if (length == 0) { + return; + } + auto kernel_name = fmt::format("{}_{}_{}_{}{}", + name, + iter.is_contiguous() ? "dense" : "strided", + scalarToMetalTypeString(outputTensor), + scalarToMetalTypeString(inputTensor), + fmt::format("_{}", params_type_name)); + @autoreleasepool { + auto cplState = getPipelineStateForFunc(kernel_name); + + MPSStream* mpsStream = getCurrentMPSStream(); + dispatch_sync(mpsStream->queue(), ^() { + auto computeEncoder = mpsStream->commandEncoder(); + + getMPSProfiler().beginProfileKernel(cplState, name, {inputTensor}); + + [computeEncoder setComputePipelineState:cplState]; + bind_iter_tensors(computeEncoder, iter); + if (!iter.is_contiguous()) { + mtl_setArgs<2>(computeEncoder, + outputTensor.sizes(), + inputTensor.strides(), + outputTensor.strides(), + inputTensor.ndimension()); + } + detail::mtl_setArg(computeEncoder, params, iter.is_contiguous() ? 2 : 6); + mtl_dispatch1DJob(computeEncoder, cplState, length); + + getMPSProfiler().endProfileKernel(cplState); + }); + } +} + +template +void MetalShaderLibrary::exec_binary_kernel_with_params(TensorIteratorBase& iter, + const std::string& name, + T params, + const std::string& params_type_name) { + using namespace mps; + // TODO: Figure a better place to downcast double scalars (probably in tensor iterator itself?) + // Right now running something like 1.0-torch.rand(5, device='mps') will create iterator with + // double as common dtype (because Python floating point are always 64-bit values) + TORCH_CHECK(iter.output().scalar_type() != at::kDouble, "float64 is not supported on MPS"); + + // Skip for empty iterators + if (iter.numel() == 0) { + return; + } + + // Decompose 64-bit tensor into 32-bit ones + if (!iter.can_use_32bit_indexing()) { + for (auto&& sub_iter : iter.with_32bit_indexing()) { + exec_binary_kernel_with_params(sub_iter, name, params, params_type_name); + } + return; + } + + auto convert_double_scalar = [](Tensor& t) { + if (t.dim() != 0) { + return; + } + if (t.scalar_type() == kDouble) { + t = t.to(kFloat); + } else if (t.scalar_type() == kComplexDouble) { + t = t.to(kComplexFloat); + } + }; + + Tensor input = iter.input(0); + Tensor other = iter.input(1); + Tensor out = iter.output(); + + convert_double_scalar(input); + convert_double_scalar(other); + + MPSStream* mpsStream = getCurrentMPSStream(); + const auto cast_needed = input.scalar_type() != other.scalar_type(); + const auto suffix = iter.is_contiguous() ? "dense" : "strided"; + // TODO: Implicitly pass both input and output types to non-cast kernels + const auto kernel_name = cast_needed + ? fmt::format("{}_{}_cast_{}_{}", name, suffix, scalarToMetalTypeString(out), params_type_name) + : fmt::format("{}_{}_{}_{}_{}", + name, + suffix, + scalarToMetalTypeString(out), + scalarToMetalTypeString(input), + params_type_name); + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { + auto computeEncoder = mpsStream->commandEncoder(); + auto binaryPSO = getPipelineStateForFunc(kernel_name); + // this function call is a no-op if MPS Profiler is not enabled + getMPSProfiler().beginProfileKernel(binaryPSO, kernel_name, {input, other}); + [computeEncoder setComputePipelineState:binaryPSO]; + // Set input and output tensors + bind_iter_tensors(computeEncoder, iter); + // Iterator is contiguous if all of its elements are dense in storage, + // i.e. it's true for both row-first and column-first tensors + if (iter.is_contiguous()) { + detail::mtl_setArg(computeEncoder, params, 3); + if (cast_needed) { + std::array size_and_types = {static_cast(c10::elementSize(input.scalar_type())), + static_cast(c10::elementSize(other.scalar_type())), + static_cast(input.scalar_type()), + static_cast(other.scalar_type())}; + mtl_setBytes(computeEncoder, size_and_types, 4); + } + } else { + // Please note that shapes and strides of the iterator might be + // different than that of its operands, for example binary op + // between 4x4 tensor and scalar will result in 1D 16 element iterator + std::array ndim_and_types = {iter.ndim(), + static_cast(input.scalar_type()), + static_cast(other.scalar_type()), + static_cast(out.scalar_type())}; + mtl_setArgs<3>( + computeEncoder, params, iter.shape(), iter.strides(0), iter.strides(1), iter.strides(2), ndim_and_types); + } + mtl_dispatch1DJob(computeEncoder, binaryPSO, iter.numel()); + getMPSProfiler().endProfileKernel(binaryPSO); + } + }); +} + +// Checks if one tensor is broadcastable into another +static bool is_dense_broadcastable(const Tensor& from, const Tensor& into) { + if (!from.is_contiguous() || !into.is_contiguous()) { + return false; + } + bool checking_squeezable_dims = false; + for (const auto dim : c10::irange(from.ndimension())) { + if (checking_squeezable_dims) { + if (from.size(-dim - 1) == 1) { + continue; + } + return false; + } + checking_squeezable_dims = from.size(-dim - 1) != into.size(-dim - 1); + } + return true; +} + +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/TensorFactory.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/TensorFactory.h new file mode 100644 index 0000000000000000000000000000000000000000..8f41c9b54e9a1651f5466ef0d0687a922a24a3e5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mps/TensorFactory.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#define AT_DISPATCH_MPS_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) AT_DISPATCH_CASE( \ + at::ScalarType::Half, \ + __VA_ARGS__) AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)) + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mtia/EmptyTensor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mtia/EmptyTensor.h new file mode 100644 index 0000000000000000000000000000000000000000..56150f713f660effa21dd39b99231848bdd71e2d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/mtia/EmptyTensor.h @@ -0,0 +1,47 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) + +#pragma once +#include + +namespace at::detail { + +TensorBase empty_mtia( + IntArrayRef size, + ScalarType dtype, + std::optional device_opt, + std::optional memory_format_opt); + +TensorBase empty_mtia( + IntArrayRef size, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); + +TensorBase empty_mtia(IntArrayRef size, const TensorOptions& options); + +TensorBase empty_strided_mtia( + IntArrayRef size, + IntArrayRef stride, + ScalarType dtype, + std::optional device_opt); + +TensorBase empty_strided_mtia( + IntArrayRef size, + IntArrayRef stride, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); + +TensorBase empty_strided_mtia( + IntArrayRef size, + IntArrayRef stride, + const TensorOptions& options); + +} // namespace at::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..f6c30fe021376ab28fb17c9e7490a500d7a6a199 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h @@ -0,0 +1,23 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +enum class NESTED_DENSE_OP : uint8_t { ADD, MUL }; + +using nested_dense_elementwise_fn = void (*)( + Tensor& result, + const Tensor& self, + const Tensor& other, + const NESTED_DENSE_OP& op); + +DECLARE_DISPATCH(nested_dense_elementwise_fn, nested_dense_elementwise_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h new file mode 100644 index 0000000000000000000000000000000000000000..d90605b91eb7ba0e0f5b415720aa74b0cdb3dacc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h @@ -0,0 +1,84 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native { + +TORCH_API Tensor NestedTensor_to_padded_tensor_generic( + const Tensor& t, + double padding, + OptionalIntArrayRef output_size); + +template +Tensor map_nt(const Tensor& nt, Func f) { + auto* nt_impl = get_nested_tensor_impl(nt); + const auto& sizes = nt_impl->get_nested_sizes(); + return at::detail::make_tensor(f(nt_impl->get_buffer()), sizes); +} +template +Tensor map_nt_binary(const Tensor& nt_1, const Tensor& nt_2, Func f){ + auto* nt_impl_1 = get_nested_tensor_impl(nt_1); + auto* nt_impl_2 = get_nested_tensor_impl(nt_2); + const auto& sizes = nt_impl_1->get_nested_sizes(); + return at::detail::make_tensor(f(nt_impl_1->get_buffer(), nt_impl_2->get_buffer()), sizes); +} + +C10_ALWAYS_INLINE std::pair _check_nested_layer_norm_inputs( + const NestedTensorImpl& input, + IntArrayRef normalized_shape, + const Tensor& weight /* optional */, + const Tensor& bias /* optional */) { + + const size_t normalized_ndim = normalized_shape.size(); + TORCH_CHECK( + normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got ", + "weight of shape ", + weight.sizes(), + " and normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", + bias.sizes(), + " and normalized_shape = ", + normalized_shape); + + // Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input + // Also, compute M and N considering the idiosyncrasies of NestedTensors + int64_t N = 1; + for (const auto i: c10::irange(normalized_ndim)) { + TORCH_CHECK( + input.opt_size(-normalized_ndim + i).has_value(), + "normalized_shape extends into irregular dimensions for the nested tensor" + ); + TORCH_CHECK( + normalized_shape[i] == input.opt_size(-normalized_ndim + i), + "The shape at dimension ", + i, + "of normalized_shape doesn't match the input" + ); + N *= normalized_shape[i]; + } + + const int64_t M = input.numel() / N; + + return std::make_pair(M, N); +} + +Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..6377cd53b04fa6681ad59e2fa8a2f86a514b55f3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h @@ -0,0 +1,108 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/** + * Transformer-specific NestedTensor utility functions. + * + * Not co-located with NestedTensor core code yet because they only + * support specific cases needed in transformers. + */ +#pragma once + +#include + +#include +#include + +namespace c10 { +class Scalar; +} // namespace c10 + +namespace at { +class Tensor; +namespace native { +struct NestedTensorImpl; + +// Requires that self is a contiguous NestedTensor, other is not a +// NestedTensor, self.dim() == 3, and other.dim() == 2. Also, self +// must have a consistent last dimension across its included Tensors +// and that dimension must match other.size(0). +Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other); + +// Requires that mat1 is a contiguous NestedTensor, self & mat2 are +// not NestedTensors, mat1.dim() == 3, mat2.dim() == 2, and that mat1 +// has a consistent last dimension across its included Tensors that +// matches mat2.size(0). +Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const c10::Scalar& beta, + const c10::Scalar& alpha, + std::optional use_gelu = std::nullopt); + +Tensor NestedTensor_add_NestedTensor_in_place( + const Tensor& self, + const Tensor& other); + +TORCH_API Tensor NestedTensor_batch_offsets_from_size_tensor( + const Tensor& sizes, + int64_t extra_elements); + +Tensor NestedTensor_from_padded_tensor_cpu( + const Tensor& padded, + const NestedTensorImpl& nt); + +TORCH_API Tensor NestedTensor_to_mask(const Tensor& nt, std::optional mask_dim, std::optional mask_dim_length); + +template +void remove_padding_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int64_t output_dim, + const int64_t batch_size); + +template +void remove_padding_transform0213_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int64_t output_dim, + const int64_t batch_size); + +template +void add_padding_kernelLauncher( + T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); + +TORCH_API Tensor flash_attention_helper( + const Tensor& query, + const Tensor& key, + const Tensor& value, + double dropout_p, + bool need_attn_weights, + bool is_causal); + +TORCH_API std::tuple mem_efficient_helper_nested_unpacked( + const Tensor& query, + const Tensor& key, + const Tensor& value, + double dropout_p, + bool need_attn_weights, + bool is_causal); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..d0f72ac63bcf4f3abbcf35169e4fc6d6b6f6be73 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native::preprocessing { + +/** + * This function will take nested query, key, and value + * and will preprocess it in order to run with either + * the flash-attention or efficient-attention kernels. + * @return A tuple containing all the necessary data for running the fused + * kernels + */ +std::tuple +sdpa_nested_preprocessing( + const Tensor& query, + const Tensor& key, + const Tensor& value); + +/** + * This function will take nested query, key, and value, grad_out, and out + * and will preprocess it in order to run with either + * the flash-attention or efficient-attention kernels backwards. + * We use both functions to avoid having to do the same preprocessing + * for cumulative_sequence_length_q and cumulative_sequence_length_kv + * @return A tuple containing all the necessary data for running the fused + * kernels + */ +std::tuple +sdpa_nested_preprocessing_backward( + const at::Tensor& grad_out_, + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const at::Tensor& out, + const Tensor& cumulative_sequence_length_q, + const Tensor& cumulative_sequence_length_kv, + const int64_t max_seqlen_batch_q, + const int64_t max_seqlen_batch_kv); + +} // namespace at::native::preprocessing + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..155f76d158df37f8202a8892427b08fa8220606d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h @@ -0,0 +1,454 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS + +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include + +namespace at::native { +struct NestedTensorImpl; + +// The following functions are used to construct nested tensors from buffers and +// metadata. + +inline at::Tensor wrap_buffer(const at::Tensor& buffer, const at::Tensor& nested_sizes) { + TORCH_CHECK( + buffer.dim() == 1, + "Expected given buffer to be 1dim, but got ", + buffer.dim(), + " instead."); + TORCH_CHECK( + buffer.is_contiguous(), "Expected given buffer to be contiguous."); + return at::detail::make_tensor( + buffer, nested_sizes); +} + +// TODO: Figure out if we need a non-moving wrap_buffer() +inline at::Tensor wrap_buffer( + const at::Tensor& buffer, + at::Tensor nested_sizes, + at::Tensor nested_strides, + at::Tensor storage_offsets) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + buffer.is_contiguous(), "Given buffer must be contiguous."); + return at::detail::make_tensor( + buffer, + std::move(nested_sizes), + std::move(nested_strides), + std::move(storage_offsets)); +} + +inline at::Tensor get_buffer(const at::Tensor& tensor) { + return get_nested_tensor_impl(tensor)->get_buffer(); +} + +/** + * Create a new nested tensor that is a view of a base nested tensor + * + * create_view_tensor calls a specialized constructor that copies the + * keys from base onto the new view tensor being created. + * The storage is shared between the base and the returned view tensor + * + * All callers of this helper must: + * - Only return a view of the input + * - Must be explicit and define a derivative + * + * @param base Base tensor to construct view from. + * @param nested_sizes View tensors' sizes. + * @param nested_strides View tensors' strides. + * @param storage_offsets View tensors' offsets. + * @return A newly constructed view tensor + */ +inline at::Tensor create_nested_view_tensor( + const at::Tensor& base, + at::Tensor nested_sizes, + at::Tensor nested_strides, + at::Tensor storage_offsets) { + TORCH_INTERNAL_ASSERT( + base.is_nested(), + "This function can only be used to create nested tensor views"); + TORCH_INTERNAL_ASSERT( + c10::impl::tls_local_dispatch_key_set().excluded_.has( + c10::DispatchKey::AutogradFunctionality), + "Creating a non differentiable nested tensor view in a CompositeImplicit function is not allowed."); + return at::detail::make_tensor( + c10::TensorImpl::VIEW, + base, + std::move(nested_sizes), + std::move(nested_strides), + std::move(storage_offsets)); +} +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +// Helper functions for getting information about a nested tensor's shape. + +int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt); + +// The sizes of the underlying tensors +inline std::vector NestedTensor_get_sizes( + const NestedTensorImpl* self_ptr) { + int64_t ntensors = self_ptr->size(0); + std::vector sizes(ntensors); + if (ntensors == 0) { + return sizes; + } + const Tensor& sizemat = self_ptr->get_nested_sizes(); + int64_t orig_dim = sizemat.size(1); + // nesting scalars has empty sizes + if (orig_dim == 0) { + return sizes; + } + const int64_t* sizemat_ptr = sizemat.const_data_ptr(); + + for (const auto i : c10::irange(ntensors)) { + sizes[i] = IntArrayRef(sizemat_ptr, sizemat_ptr + orig_dim); + sizemat_ptr += orig_dim; + } + return sizes; +} + +TORCH_API std::vector NestedTensor_get_max_size( + const NestedTensorImpl& nt); + +std::vector NestedTensor_get_max_size_from_size_tensor( + const Tensor& sizes); + +inline std::vector NestedTensor_get_sizes(const at::Tensor& self) { + const NestedTensorImpl* self_ptr = get_nested_tensor_impl(self); + return NestedTensor_get_sizes(self_ptr); +} +// The strides of the underlying tensors +inline std::vector NestedTensor_get_strides( + const NestedTensorImpl* self_ptr) { + int64_t ntensors = self_ptr->size(0); + std::vector strides(ntensors); + if (ntensors == 0) { + return strides; + } + const Tensor& stridemat = self_ptr->get_nested_strides(); + int64_t orig_dim = stridemat.size(1); + // nesting scalars has empty strides + if (orig_dim == 0) { + return strides; + } + const int64_t* stridemat_ptr = stridemat.const_data_ptr(); + for (const auto i : c10::irange(ntensors)) { + strides[i] = IntArrayRef(stridemat_ptr, stridemat_ptr + orig_dim); + stridemat_ptr += orig_dim; + } + return strides; +} + +inline std::vector NestedTensor_get_strides( + const at::Tensor& self) { + const NestedTensorImpl* self_ptr = get_nested_tensor_impl(self); + return NestedTensor_get_strides(self_ptr); +} + +inline void check_numel_equals_buffer_size(const at::Tensor& self) { + auto self_impl = get_nested_tensor_impl(self); + TORCH_CHECK( + self.numel() == static_cast(self_impl->get_buffer_size()), + "Number of elements in nested tensor must match number of elements in buffer."); +} + +inline void check_numel_equals_buffer_size(const NestedTensorImpl* self_ptr) { + TORCH_CHECK( + self_ptr->numel() == static_cast(self_ptr->get_buffer_size()), + "Number of elements in nested tensor must match number of elements in buffer."); +} + +// Helper function to get size / stride / offset for a nested/normal tensor. +inline IntArrayRef get_size_for_index(const Tensor& tensor, int64_t i) { + if (tensor.is_nested()) { + std::vector tensor_sizes = + NestedTensor_get_sizes(get_nested_tensor_impl(tensor)); + return tensor_sizes[i]; + } else { + return tensor.sizes().slice(1); + } +} + +inline IntArrayRef get_stride_for_index(const Tensor& tensor, int64_t i) { + if (tensor.is_nested()) { + std::vector tensor_strides = + NestedTensor_get_strides(get_nested_tensor_impl(tensor)); + return tensor_strides[i]; + } else { + return tensor.strides().slice(1); + } +} + +inline int64_t get_offset_for_index(const Tensor& tensor, int64_t i) { + if (tensor.is_nested()) { + int64_t* offsets_ptr = get_nested_tensor_impl(tensor) + ->get_storage_offsets() + .data_ptr(); + return offsets_ptr[i]; + + } else { + int64_t offset = tensor.storage_offset(); + return offset + tensor.strides()[0] * i; + } +} +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Data structures and functions for generically applying a function on a nested +// tensor. +namespace impl { + +template +struct NestedNode { + NestedNode() = delete; + explicit NestedNode(std::vector children) + : _is_leaf(false), _children(std::move(children)) {} + explicit NestedNode(TensorList children) + : _is_leaf(false), _children(children.vec()) {} + explicit NestedNode(T payload) + : _is_leaf(true), _payload(std::move(payload)) {} + NestedNode(const NestedNode&) = delete; + NestedNode& operator=(const NestedNode&) = delete; + NestedNode(NestedNode&&) noexcept = default; + NestedNode& operator=(NestedNode&&) noexcept = default; + ~NestedNode() = default; + inline bool is_leaf() const { + return _is_leaf; + } + inline size_t degree() const { + return _children.size(); + } + inline const std::vector unbind() const { + return _children; + } + inline T children(size_t i) const { + return _children[i]; + } + inline const T& payload() const { + return _payload; + } + inline T& payload() { + return _payload; + } + + private: + bool _is_leaf; + std::vector _children; + T _payload{}; +}; + +using TensorNode = NestedNode; + +template +class _map; + +template +class _map> { + public: + static A function_one(const F& fn, const Args&... nested_node) { + return fn(nested_node...); + } + static NestedNode function( + const F& fn, + const NestedNode&... nested_node) { + size_t degree = 0; + bool all_leaf = true; + c10::guts::tuple_map( + std::forward_as_tuple(nested_node...), [&all_leaf, °ree](auto n) { + all_leaf = all_leaf && (n.is_leaf()); + if (degree > 1 && n.degree() > 1) { + TORCH_CHECK( + degree == n.degree(), "NestedNodes must match in degree."); + } + if (n.degree() > degree) { + degree = n.degree(); + } + return nullptr; + }); + // All NestedNodes just wrap regular objects. + if (all_leaf) { + return NestedNode(std::forward(fn)(nested_node.payload()...)); + } + // Some NestedNodes wrap regular Tensors, some NestedTensors and some other + // types. + std::vector result; + for (size_t i = 0; i < degree; i++) { + auto children = c10::guts::tuple_map( + std::forward_as_tuple(nested_node...), [&i](auto a) { + static_assert( + c10::guts::is_instantiation_of::value, + "Internal error."); + // Broadcast regular arguments across NestedTensor constituents. + // This could be a Tensor, integer or anything else really. + if (a.is_leaf()) { + return a.payload(); + } + // Broadcast NestedTensors with one constituent. + if (a.degree() == 1 && !a.is_leaf()) { + return a.children(0); + } + TORCH_CHECK(a.degree() > 0, "Internal assert."); + return a.children(i); + }); + std::apply( + [&result, &fn](Args... filtered) { + result.emplace_back(function_one(fn, filtered...)); + }, + std::move(children)); + } + return NestedNode(std::move(result)); + } +}; + +// TODO: Add static assert to verify lambda arguments match nested_node types +template +static inline NestedNode< + typename c10::guts::infer_function_traits::type::return_type> +map(F&& fn, const NestedNode&... nested_node) { + return _map< + F, + typename c10::guts::infer_function_traits::type::return_type, + typename c10::guts::infer_function_traits::type::parameter_types>:: + function(std::forward(fn), nested_node...); +} + +inline TensorNode get_nested_tensor_structure(at::Tensor tensor) { + if (get_nested_tensor_impl_or_null(tensor) == nullptr) { + return TensorNode(std::move(tensor)); + } + return TensorNode(tensor.unbind()); +} + +inline Tensor wrap_tensor_node( + TensorNode tensor_node, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { + TORCH_CHECK( + !tensor_node.is_leaf(), "Expected TensorNode to wrap a list of Tensors."); + TensorOptions options_ = + TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( + pin_memory); + if (tensor_node.degree() == 0) { + return wrap_buffer(ones({0}, dtype, layout, device), ones({})); + } + + // Fast path: if all tensors are on CPU, have contiguous memory, and the same + // dtype, copying can be done much faster. + bool all_tensors_cpu = true; + bool all_tensors_contiguous = true; + bool all_tensors_same_dtype = true; + auto first_dtype = tensor_node.children(0).dtype(); + std::vector start_offsets(tensor_node.degree()); + start_offsets[0] = 0; + long total_size = 0; + for (const auto i : c10::irange(tensor_node.degree())) { + all_tensors_cpu = all_tensors_cpu && tensor_node.children(i).is_cpu(); + all_tensors_contiguous = + all_tensors_contiguous && tensor_node.children(i).is_contiguous(); + all_tensors_same_dtype = all_tensors_same_dtype && + (first_dtype == tensor_node.children(i).dtype()); + if (!(all_tensors_cpu && all_tensors_contiguous && + all_tensors_same_dtype)) { + break; + } + if (i > 0) { + start_offsets[i] = + start_offsets[i - 1] + tensor_node.children(i - 1).numel(); + } + total_size += tensor_node.children(i).numel(); + } + + TensorOptions options; + Tensor nt_buffer, nt_sizes; + if (all_tensors_cpu && all_tensors_contiguous && all_tensors_same_dtype) { + nt_buffer = at::empty({total_size}, tensor_node.children(0).options()); + nt_sizes = at::empty( + {static_cast(tensor_node.degree()), + static_cast(tensor_node.children(0).sizes().size())}, + TensorOptions().dtype(kLong)); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, + at::ScalarType::Bool, + at::ScalarType::BFloat16, + c10::typeMetaToScalarType(first_dtype), + "create_nt_buffer", + [&]() { + at::parallel_for( + 0, tensor_node.degree(), 1, [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; ++i) { + // Only try copying memory if there is more than 0 elements + // for a certain tensor + if (tensor_node.children(i).numel() > 0) { + memcpy( + nt_buffer.mutable_data_ptr() + start_offsets[i], + tensor_node.children(i).const_data_ptr(), + tensor_node.children(i).numel() * sizeof(scalar_t)); + } + } + }); + }); + long sizes_offset = 0; + for (size_t i = 0; i < tensor_node.degree(); ++i) { + auto tensor_sizes = tensor_node.children(i).sizes(); + for (int64_t tensor_size : tensor_sizes) { + nt_sizes.mutable_data_ptr()[sizes_offset++] = tensor_size; + } + } + options = nt_buffer.options().merge_in(options_); + } else { // Slow path + std::vector flat_tensors; + std::vector sizes; + for (const auto i : c10::irange(tensor_node.degree())) { + flat_tensors.push_back(tensor_node.children(i).reshape(-1).contiguous()); + sizes.push_back( + tensor(c10::IntArrayRef(tensor_node.children(i).sizes()))); + } + options = flat_tensors[0].options().merge_in(options_); + nt_buffer = at::cat(flat_tensors); + nt_sizes = at::native::stack(sizes); + } + + return wrap_buffer(nt_buffer.to(options), nt_sizes); +} + +} // namespace impl + +// This function is meant to ease rapid operator coverage for +// NestedTensor kernels. It is not meant to be efficient. Use it judiciously. +template +inline at::Tensor map_nested_tensor(F&& fn, A... a) { + return wrap_tensor_node( + impl::map(std::forward(fn), impl::get_nested_tensor_structure(a)...), + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h new file mode 100644 index 0000000000000000000000000000000000000000..0259b4e9f9fd3b2019534ac8c279a5deefa22a6b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h @@ -0,0 +1,133 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +TORCH_API Tensor& quantize_tensor_per_tensor_affine( + const Tensor& rtensor, + Tensor& qtensor, + double scale, + int64_t zero_point); +TORCH_API Tensor& quantize_tensor_per_channel_affine( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + Tensor zero_points, + int64_t axis); + +TORCH_API Tensor& quantize_tensor_per_channel_float_qparams( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +TORCH_API Tensor& dequantize_tensor_per_tensor_affine( + const Tensor& qtensor, + Tensor& rtensor, + double scale, + int64_t zero_point); +TORCH_API Tensor& dequantize_tensor_per_channel_affine( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + Tensor zero_points, + int64_t axis); +TORCH_API Tensor& dequantize_tensor_per_channel_float_qparams( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_tensor_affine_fn = + void (*)(const Tensor& rtensor, Tensor& qtensor, double scale, int64_t zero_point); + +using quantize_tensor_per_channel_affine_fn = void (*)( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_channel_float_qparams_fn = void (*)( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using dequantize_tensor_per_tensor_affine_fn = + void (*)(const Tensor& qtensor, Tensor& rtensor, double scale, int64_t zero_point); + +using dequantize_tensor_per_channel_affine_fn = void (*)( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using dequantize_tensor_per_channel_float_qparams_fn = void (*)( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(const Tensor& rtensor, Tensor& qtensor, float scale, float zero_point); + +using dequantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(const Tensor& qtensor, Tensor& rtensor, float scale, float zero_point); + +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_fn, + quantize_tensor_per_tensor_affine_stub) +DECLARE_DISPATCH( + quantize_tensor_per_channel_affine_fn, + quantize_tensor_per_channel_affine_stub) +DECLARE_DISPATCH( + quantize_tensor_per_channel_float_qparams_fn, + quantize_tensor_per_channel_float_qparams_stub) + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_fn, + dequantize_tensor_per_tensor_affine_stub) +DECLARE_DISPATCH( + dequantize_tensor_per_channel_affine_fn, + dequantize_tensor_per_channel_affine_stub) +DECLARE_DISPATCH( + dequantize_tensor_per_channel_float_qparams_fn, + dequantize_tensor_per_channel_float_qparams_stub) + +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_sub_byte_fn, + quantize_tensor_per_tensor_affine_sub_byte_stub) + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_sub_byte_fn, + dequantize_tensor_per_tensor_affine_sub_byte_stub) + +template +TORCH_API Tensor quantize_tensor( + Tensor rtensor, + Tensor qtensor, + double scale, + int64_t zero_point); +template +TORCH_API Tensor dequantize_tensor( + Tensor qtensor, + Tensor rtensor, + double scale, + int64_t zero_point); + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h new file mode 100644 index 0000000000000000000000000000000000000000..4f6c4e4ec47cb3511fbf34a7da353c67a977e685 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +// Quantize a float value into a uint value given scale and zero_point +template +TORCH_API T quantize_val(double scale, int64_t zero_point, float value); +// TODO combine this with quantize_val once the numerics for ARM are aligned +// with it +template +T quantize_val_arm( + const float scale, + const int32_t zero_point, + const float value); +template +void quantize_vec( + double scale, + int64_t zero_point, + const float* src, + T* dst, + size_t count = 8); +template +TORCH_API float dequantize_val(double scale, int64_t zero_point, T value); +template +TORCH_API float dequantize_vec( + double scale, + int64_t zero_point, + const T* src, + float* dst, + size_t count = 8); +template +TORCH_API DST_T requantize_val(double /*src_scale*/, int64_t /*src_zero_point*/, double /*dst_scale*/, int64_t /*dst_zero_point*/, SRC_T src); + +// Given a multiplier and a zero_point, requantize int32_t computed values back +// to quantized values. See comment above +// make_per_tensor_affine_quantizer function for the usage of int64_t +template +TORCH_API DST_T +requantize_from_int(double multiplier, int64_t zero_point, int64_t src); + +int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax); + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/ConvUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/ConvUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..b80cd4458b2cfd48d34f02caa6e73e95fe13aba4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/ConvUtils.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native::quantized { +namespace { +// MakeConvOutputShape used from both CPU and CUDA libraries +// and exporting symbol from torch_cpu would probably take more storage +// than duplicating implementation which likely be inlined away +template +at::SmallVector MakeConvOutputShape( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const torch::List& stride, + const torch::List& padding, + const torch::List& dilation); + +#if defined(USE_CUDA) || defined(USE_PYTORCH_QNNPACK) +template <> +at::SmallVector MakeConvOutputShape<2>( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const at::List& stride, + const at::List& padding, + const at::List& dilation) { + const int H = input_image_shape[0]; + const int W = input_image_shape[1]; + const int64_t Y_H = + (H + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; + const int64_t Y_W = + (W + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; + return {N, M, Y_H, Y_W}; +} + +template <> +at::SmallVector MakeConvOutputShape<3>( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const at::List& stride, + const at::List& padding, + const torch::List& dilation) { + const int D = input_image_shape[0]; + const int H = input_image_shape[1]; + const int W = input_image_shape[2]; + const int64_t Y_D = + (D + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; + const int64_t Y_H = + (H + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; + const int64_t Y_W = + (W + 2 * padding[2] - dilation[2] * (kernel[2] - 1) - 1) / stride[2] + 1; + return {N, M, Y_D, Y_H, Y_W}; +} + +#endif +} // anonymous namespace +} // namespace at::native::quantized + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/Copy.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..292f10e3839b4093308066f1cd4f37ca3eb30468 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/Copy.h @@ -0,0 +1,13 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src); +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h new file mode 100644 index 0000000000000000000000000000000000000000..a8d904e848059f82dea19ac81bb53b08e201eee6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h @@ -0,0 +1,72 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at { + +struct TensorIterator; + +namespace native { + +using fake_quant_tensor_cachemask_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + float sc, + int64_t z_point, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_tensor_cachemask_tensor_qparams_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + const Tensor& sc, + const Tensor& z_point, + const Tensor& fake_quant_enabled, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_learnable_grad_tensor_fn = void (*)( + TensorIterator& iter, + float scale, + float inv_scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + float grad_factor); + +DECLARE_DISPATCH(fake_quant_tensor_cachemask_fn, fake_quant_tensor_cachemask_stub) +DECLARE_DISPATCH(fake_quant_tensor_cachemask_tensor_qparams_fn, fake_quant_tensor_cachemask_tensor_qparams_stub) +DECLARE_DISPATCH(fake_quant_learnable_grad_tensor_fn, fake_quant_grad_learnable_tensor_stub) + +using fake_quant_per_channel_fn = void (*)( + TensorIterator &iter, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_per_channel_cachemask_fn = void (*)( + TensorIterator &iter, + TensorIterator &iter_mask, + int64_t quant_min, + int64_t quant_max); + +DECLARE_DISPATCH(fake_quant_per_channel_cachemask_fn, fake_quant_per_channel_cachemask_stub) + +using fake_quant_learnable_per_channel_fn = void (*)( + TensorIterator &iter, + int64_t quant_min, + int64_t quant_max, + float grad_factor); + +DECLARE_DISPATCH(fake_quant_learnable_per_channel_fn, fake_quant_grad_learnable_channel_stub) + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/IndexKernel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/IndexKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ca6d5c92b47660519c64dd78e59ec539d42b9b65 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/IndexKernel.h @@ -0,0 +1,18 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { +using masked_fill_kernel_quantized_fn = void(*)(TensorIterator& iter, const Scalar& value, double scale, int zero_point); +using index_put_kernel_quantized_fn = void(*)(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate, double scale, int zero_point); + +DECLARE_DISPATCH(masked_fill_kernel_quantized_fn, masked_fill_kernel_quantized_stub) +DECLARE_DISPATCH(index_put_kernel_quantized_fn, index_put_kernel_quantized_stub) + + +} // at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/PackedParams.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/PackedParams.h new file mode 100644 index 0000000000000000000000000000000000000000..6f47b5d67212f36cfa96aea8260acb5aafa4da09 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/PackedParams.h @@ -0,0 +1,139 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + + // out variant of LinearPackedParamsBase::apply + virtual at::Tensor& apply_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + TORCH_CHECK(false, "apply_out is not implemented for this packed parameter type"); + return output; + } + + virtual at::Tensor& apply_relu_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + TORCH_CHECK(false, "apply_relu_out is not implemented for this packed parameter type"); + return output; + } + + // Corresponding pattern (the ops with `*` are part of the pattern that + // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_output_fp32): + // input -> q* -> dq* -> linear* -> + // qweight -> dq* / + // + // After fusion: + // input -> quantized::linear_with_input_q_dq_qweight_dq_output_fp32* -> + // qweight / + // + // Additional Note: the weight is packed as well + // Params: + // X: float32 Tensor, will be quantized to quint8 in the op + // W_prepack: packed qint8 quantized weight and bias + // Returns: + // Y: float32 Tensor + virtual at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + TORCH_CHECK(false, "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed parameter type"); + return {}; + } + + // Corresponding pattern (the ops with `*` are part of the pattern that + // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32): + // input -> q* -> dq* -> linear* -> relu* -> + // qweight -> dq* / + // + // After fusion: + // input -> quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32* -> + // qweight / + // + // Additional Note: the weight is packed as well + // Params: + // input: float32 Tensor, will be quantized to quint8 in the op + // Returns: + // float32 Tensor + virtual at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + TORCH_CHECK(false, "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed parameter type"); + return {}; + } + + virtual at::Tensor apply_dynamic( + at::Tensor input, + bool reduce_range = false) = 0; + virtual at::Tensor apply_dynamic_relu( + at::Tensor input, + bool reduce_range = false) = 0; + + virtual at::Tensor& apply_dynamic_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + TORCH_CHECK(false, "apply_dynamic_out is not implemented for this packed parameter type"); + return output; + } + virtual at::Tensor& apply_dynamic_relu_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + TORCH_CHECK(false, "apply_dynamic_relu_out is not implemented for this packed parameter type"); + return output; + } + + virtual std::tuple> unpack() = 0; + + virtual std::optional bias() = 0; + + virtual void set_bias(std::optional /*bias*/) { + TORCH_CHECK(false, "set_bias is not implemented for this packed parameter type"); + } +}; + +template +struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) = 0; + + virtual std::tuple> unpack() = 0; + + virtual torch::List stride() const = 0; + virtual torch::List padding() const = 0; + virtual torch::List output_padding() const = 0; + virtual torch::List dilation() const = 0; + virtual int64_t groups() const = 0; + virtual bool transpose() const = 0; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/ACLUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/ACLUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..fe4c17a6bc88f2bb7dea2c015cb19aa6c4502e26 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/ACLUtils.h @@ -0,0 +1,262 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#if AT_MKLDNN_ACL_ENABLED() + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Utilities for Arm Compute Library (ACL) quantized operations +// Provides interfaces to leverage ACL's accelerated kernels for statically and +// dynamically quantized matmuls (i.e. qlinear and qlinear_dynamic) These are +// utalized through PackedLinearWeightsACL which extends +// PackedLinearWeightsOnednn Note that PackedLinearWeightsACL extends rather +// than replaces PackedLinearWeightsOnednn for AArch64 because ACL currently +// only supports per_tensor weight quantization. +namespace at::native::acl_utils { + +using QuantMatmulCacheKey = std::tuple< + int64_t, // M + bool, // FUSE_RELU + int64_t, // NUM_THREADS + double, // INPUT_SCALE + int64_t, // INPUT_OFFSET + double, // OUTPUT_SCALE + int64_t, // OUTPUT_OFFSET + bool // SIGNED_INPUT + >; + +enum class QuantMatmulCacheKeyIndex { + M, + FUSE_RELU, + NUM_THREADS, + INPUT_SCALE, + INPUT_OFFSET, + OUTPUT_SCALE, + OUTPUT_OFFSET, + SIGNED_INPUT +}; + +// Abstract interface to share common stuff between static/dynamic ACL matmuls. +struct QuantMatmul { + arm_compute::NEGEMMLowpMatrixMultiplyCore gemm; + // key for use in the cache + QuantMatmulCacheKey key; + + QuantMatmul( + int64_t weight_dim_0, + int64_t weight_dim_1, + double weight_scale, + int64_t weight_offset, + int8_t* weight_ptr, + std::optional bias_ptr, + const QuantMatmulCacheKey& cache_key); + + virtual ~QuantMatmul(); + virtual arm_compute::Status validate() = 0; + virtual void configure() = 0; + + protected: + arm_compute::Tensor wei_q_tensor_; + std::optional bia_tensor_; + arm_compute::GEMMInfo gemm_info_; + std::optional relu_info_; +}; + +struct DynamicQuantMatmul : public QuantMatmul { + arm_compute::Tensor src_q_tensor; + arm_compute::Tensor src_tensor; + arm_compute::Tensor dst_tensor; + arm_compute::NEQuantizationLayer quant; + // We need a ReLU layer here (unlike static quantization) because the ReLU + // cannot be "truly" fused with the GEMM through gemm_info in ACL dynamically + // quantized matmuls. + std::optional relu; + + DynamicQuantMatmul( + int64_t weight_dim_0, + int64_t weight_dim_1, + double weight_scale, + int64_t weight_offset, + int8_t* weight_ptr, + std::optional bias_ptr, + const QuantMatmulCacheKey& cache_key); + + ~DynamicQuantMatmul() override; + + arm_compute::Status validate() override; + void configure() override; + + private: + at::Tensor src_q_tensor_orig_; +}; + +struct StaticQuantMatmul : public QuantMatmul { + arm_compute::Tensor src_q_tensor; + arm_compute::Tensor dst_q_tensor; + + StaticQuantMatmul( + int64_t weight_dim_0, + int64_t weight_dim_1, + double weight_scale, + int64_t weight_offset, + int8_t* weight_ptr, + std::optional bias_ptr, + const QuantMatmulCacheKey& cache_key); + + ~StaticQuantMatmul() override; + + arm_compute::Status validate() override; + void configure() override; + + private: + std::optional bia_q_tensor_; + std::optional bia_q_tensor_orig_; +}; + +struct QuantAdd { + arm_compute::Tensor qa_tensor; + arm_compute::Tensor qb_tensor; + arm_compute::Tensor qdst_tensor; + arm_compute::NEArithmeticAddition q_add; + + QuantAdd( + arm_compute::DataType dtype, + const std::vector& input_dims, + double qa_scale, + int64_t qa_offset, + double qb_scale, + int64_t qb_offset, + double dst_scale, + int64_t dst_offset); + + arm_compute::Status validate(); + void configure(); + + private: + arm_compute::ConvertPolicy policy{arm_compute::ConvertPolicy::SATURATE}; +}; + +} // namespace at::native::acl_utils +struct PackedLinearWeightsACL : public PackedLinearWeightsOnednn { + using ACLQuantMatmul = at::native::acl_utils::QuantMatmul; + using ACLDynamicQuantMatmul = at::native::acl_utils::DynamicQuantMatmul; + using ACLStaticQuantMatmul = at::native::acl_utils::StaticQuantMatmul; + using ACLQuantMatmulCacheKey = at::native::acl_utils::QuantMatmulCacheKey; + using ACLQuantMatmulCacheKeyIndex = + at::native::acl_utils::QuantMatmulCacheKeyIndex; + + PackedLinearWeightsACL( + std::unique_ptr weight, + std::optional bias, + at::Tensor orig_weight, + std::optional orig_bias); + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) + override; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + template + std::shared_ptr get_acl_quant_matmul( + const ACLQuantMatmulCacheKey& key) { + return std::dynamic_pointer_cast( + fetch_or_create_acl_quant_matmul(key)); + } + + private: + int64_t k_; + int64_t n_; + int64_t weight_zero_point_; + double weight_scale_; + + // A 2 element (per layer) cache. Given it's not intended to store more than 2 + // elements, we do not need a fancy implementation. The idea behind it is to + // allow for a (configuration free) fast path for autoregressive + // transformer-like models which usually involve 2 input tensor shapes; one + // for the prefill phase and another for the autoregressive phase + std::array, 2> cache_; + + template + std::shared_ptr fetch_or_create_acl_quant_matmul( + const ACLQuantMatmulCacheKey& key) { + // We're only maintaining a 2 element LRU cache + // hit first + if (cache_[0] != nullptr && cache_[0]->key == key) { + return cache_[0]; + } + // hit second + if (cache_[1] != nullptr && cache_[1]->key == key) { + // Update LRU + std::swap(cache_[0], cache_[1]); + return cache_[0]; + } + // miss -> replace Least Recently Used - i.e. element at index 1 + cache_[1] = create_acl_quant_matmul(key); + std::swap(cache_[0], cache_[1]); + return cache_[0]; + } + + template + std::shared_ptr create_acl_quant_matmul( + const ACLQuantMatmulCacheKey& key) { + std::optional bias_ptr; + if (bias_.has_value()) { + bias_ptr = (float*)bias_.value().get_data_handle(); + } + auto acl_gemm = std::make_shared( + k_, + n_, + weight_scale_, + weight_zero_point_, + (int8_t*)weight_.get()->get_data_handle(), + bias_ptr, + key); + + // validate + auto status = acl_gemm->validate(); + if (status.error_code() != arm_compute::ErrorCode::OK) { + TORCH_WARN( + "Arm Compute Library's Quantized Matmul Validation Failed: " + + status.error_description()); + return nullptr; + } + + // configure + acl_gemm->configure(); + return acl_gemm; + } + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false); + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point); +}; + +#endif // AT_MKLDNN_ACL_ENABLED() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..a27322debae0ebfe98adac788080b5db451e4aa8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h @@ -0,0 +1,11 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +namespace at::native { +TORCH_API Tensor +quantized_add(Tensor qa, Tensor qb, double scale, int64_t zero_point); +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h new file mode 100644 index 0000000000000000000000000000000000000000..30e18dde45a889df235adbf337cc642d89678c4c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor embeddingbag_byte( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) = 0; + + virtual at::Tensor embeddingbag_4bit( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) = 0; + + virtual at::Tensor unpack() = 0; + + virtual int64_t bit_rate() const = 0; + virtual int64_t version() const = 0; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..5215485ef744b4f905c5b6f0ac48fcc7f1c33514 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h @@ -0,0 +1,506 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#if AT_MKLDNN_ENABLED() +#include +#include +#include +#if !defined(__powerpc__) +#include +#endif + +#include + +using PrimitiveCacheKey = std::tuple< + double, // input_scale + int64_t, // input_zero_point + std::vector, // input_shape + double, // output_scale + int64_t, // output_zero_point + int64_t, // OMP_number_of_threads + double, // accum_scale + int64_t>; // accum_zero_point + +enum CacheKeyIndex { + InputScale, + InputZeroPoint, + InputShape, + OutputScale, + OutputZeroPoint, + NumOfThreads, +}; + +// Base class of primitive cache +struct PrimitiveCache { + PrimitiveCacheKey key; + + bool hit(const PrimitiveCacheKey& key) { + return this->key == key; + } +}; + +using LinearParams = ideep::matmul_forward_params; +using Conv = dnnl::convolution_forward; +using ConvDesc = dnnl::convolution_forward::primitive_desc; +using ConvParams = ideep::convolution_forward_params; +using Deconv = dnnl::deconvolution_forward; +using DeconvDesc = dnnl::deconvolution_forward::primitive_desc; +using DeconvParams = ideep::deconv_forward_params; + +struct LinearPrimitiveCache : PrimitiveCache { + LinearPrimitiveCache() = default; + + LinearPrimitiveCache( + const PrimitiveCacheKey& key, + const LinearParams& param) { + this->key = key; + this->param = param; + } + + LinearParams param; + + // For dynamic qlinear, scale and zero point + // are set at execution time. So we only need to compare + // the rest part of key. + bool hit_dynamic(const PrimitiveCacheKey& new_key) { + auto const& cached_input_shape = std::get(this->key); + auto const& new_input_shape = std::get(new_key); + return ( + cached_input_shape == new_input_shape && + std::get(this->key) == std::get(new_key)); + } + + LinearParams& get_param() { + return param; + } +}; + +struct ConvPrimitiveCache : PrimitiveCache { + ConvPrimitiveCache() = default; + + ConvPrimitiveCache( + const PrimitiveCacheKey& key, + const ConvParams& params) { + this->key = key; + this->params = params; + } + + ConvParams params; + + ConvParams& get_params() { + return params; + } +}; + +struct DeconvPrimitiveCache : PrimitiveCache { + DeconvPrimitiveCache() = default; + + DeconvPrimitiveCache( + const PrimitiveCacheKey& key, + const DeconvParams& params) { + this->key = key; + this->params = params; + } + + DeconvParams params; + + DeconvParams& get_params() { + return params; + } +}; + +enum PostOps { + NoPostOp, + Relu, + LeakyRelu, + Tanh, + Gelu +}; + + +struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { + PackedLinearWeightsOnednn( + std::unique_ptr weight, + std::optional bias, + at::Tensor orig_weight, + std::optional orig_bias) + : weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)) { + cache_initialized_flag = std::make_unique(); + } + std::unique_ptr weight_; + std::optional bias_; + at::Tensor orig_weight_; + std::optional orig_bias_; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + at::Tensor apply_leaky_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point, + double negative_slope); + + at::Tensor apply_tanh( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + std::tuple> unpack() override; + + std::optional bias() override { + return orig_bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + private: + LinearPrimitiveCache prim_cache; + std::unique_ptr cache_initialized_flag; + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point, + torch::List post_op_args = torch::List()); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range=false); + + LinearPrimitiveCache& get_cache() { + return prim_cache; + } +}; + +template +struct PackedConvWeightsOnednn : public ConvPackedParamsBase { + PackedConvWeightsOnednn( + std::unique_ptr weight, + std::optional bias, + at::Tensor orig_weight, + std::optional orig_bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + uint8_t transpose) + : weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose) { + cache_initialized_flag = std::make_unique(); + } + + std::unique_ptr weight_; + std::optional bias_; + at::Tensor orig_weight_; + std::optional orig_bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + uint8_t transpose_; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + at::Tensor apply_add( + const at::Tensor& input, + const at::Tensor& accum, + double output_scale, + int64_t output_zero_point); + + at::Tensor apply_add_relu( + const at::Tensor& input, + const at::Tensor& accum, + double output_scale, + int64_t output_zero_point); + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return (bool)transpose_; + } + + private: + ConvPrimitiveCache conv_prim_cache; + DeconvPrimitiveCache deconv_prim_cache; + std::unique_ptr cache_initialized_flag; + + template + at::Tensor apply_impl( + const at::Tensor& input, + const std::optional& accum, + double output_scale, + int64_t output_zero_point); + + ConvPrimitiveCache& get_conv_cache() { + assert(!transpose()); + return conv_prim_cache; + } + + DeconvPrimitiveCache& get_deconv_cache() { + assert(transpose()); + return deconv_prim_cache; + } +}; + +namespace onednn_utils { + +inline ideep::attr_t create_attr_by_post_op( + const std::string_view& binary_post_op, + double binary_alpha, + double input1_scale, + int64_t input1_zero_point, + const ideep::tensor::desc& input1_desc, + const std::string_view& unary_post_op, + const torch::List>& unary_post_op_args, + const std::string_view& unary_post_op_algorithm) { + using ideep::tensor; + if (binary_post_op == "none") { + if (unary_post_op == "relu") { + return ideep::attr_t::fuse_relu(); + } else if (unary_post_op == "leaky_relu") { + TORCH_CHECK( + unary_post_op_args.size() == 1, + "onednn qlinear: expect one argument for post op leaky_relu but got ", unary_post_op_args.size(), " args"); + auto alpha = unary_post_op_args[0].value().to(); + return ideep::attr_t::fuse_relu_v2(alpha); + } else if (unary_post_op == "tanh") { + return ideep::attr_t::fuse_tanh(); + } else if (unary_post_op == "gelu") { + TORCH_CHECK( + unary_post_op_algorithm == "none" || unary_post_op_algorithm == "tanh", + "onednn qlinear: algorithm for post op gelu must be none or tanh but got ", unary_post_op_algorithm); + auto post_algorithm = unary_post_op_algorithm == "none" ? + dnnl::algorithm::eltwise_gelu_erf : + dnnl::algorithm::eltwise_gelu_tanh; + return ideep::attr_t::fuse_gelu_v2(0.f, 0.f, post_algorithm); + } else if (unary_post_op == "hardtanh") { + TORCH_CHECK( + unary_post_op_args.size() == 2 && + unary_post_op_args[0].has_value() && + unary_post_op_args[1].has_value(), + "hardtanh is expected to have two scalar input: min_val and max_val"); + auto lower_bound_value = + unary_post_op_args[0].value().to(); + auto upper_bound_value = + unary_post_op_args[1].value().to(); + return ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value); + } else if (unary_post_op == "hardswish") { + return ideep::attr_t::fuse_hardswish(); + } else if (unary_post_op == "swish") { + return ideep::attr_t::fuse_swish(); + } else { + TORCH_CHECK( + unary_post_op == "none", + "onednn qlinear: unsupported unary post op ", unary_post_op); + } + } else if (binary_post_op == "sum") { + if (unary_post_op == "none") { + return ideep::attr_t::fuse_sum(input1_scale, input1_zero_point); + } else if (unary_post_op == "relu") { + return ideep::attr_t::residual_with_sum_zero_point(input1_scale, input1_zero_point); + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op sum"); + } + } else if (binary_post_op == "add") { + if (unary_post_op == "none") { + return ideep::attr_t::fuse_binary(ideep::algorithm::binary_add, input1_desc); + } else if (unary_post_op == "relu") { + ideep::post_ops po; + po.append_binary(ideep::algorithm::binary_add, input1_desc); + po.append_eltwise(ideep::algorithm::eltwise_relu, 0, 0); + return ideep::attr_t::attr_post_ops(po); + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op add"); + } + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported binary post op ", binary_post_op); + } + return ideep::attr_t(); +} + +// ONEDNN requires symmetric quantization of weight +// Use this util function to check. +inline bool is_weight_symmetric_quant( + const at::Tensor& weight, + bool is_transposed_conv) { + bool is_symmetric = true; + const auto qtype = weight.qscheme(); + if (qtype == c10::kPerTensorAffine) { + is_symmetric &= (weight.q_zero_point() == 0); + } else if (qtype == c10::kPerChannelAffine) { + if (is_transposed_conv) { + // This case is currently not supported in PyTorch + // but we do not want to raise an error in this util function. + is_symmetric = false; + } else { + auto output_channels = weight.size(0); + for (int i = 0; i < output_channels; ++i) { + auto zp = weight.q_per_channel_zero_points()[i].item(); + is_symmetric &= (zp == 0); + } + } + } else { + // This case is currently not supported in PyTorch + // but we do not want to raise an error in this util function. + is_symmetric = false; + } + return is_symmetric; +} + +// When qengine is x86, use this util func to check if onednn kernel +// is preferred than fbgemm's to get better performance. +inline bool should_use_onednn_quant( + const at::Tensor& weight, + bool is_transposed_conv, + int groups, + torch::List output_padding) { + // Performance of onednn is only validated on Linux right now. + // Also, the heuristics for dispatching are based on perf data on Linux. + // So, for x86 qengine, we always use fbgemm kernels if OS is not Linux. + // TODO Support more OSs. +#if !defined(__linux__) + return false; +#else +#if defined(__powerpc__) + constexpr auto vnni_available = true; +#else + const auto vnni_available = cpuinfo_has_x86_avx512vnni(); +#endif + bool w_sym_quant = + is_weight_symmetric_quant(weight, is_transposed_conv); + bool opad_all_zero = + std::all_of(output_padding.begin(), output_padding.end(), [](int i) { return i==0; }); + return vnni_available && (groups <= 100) && w_sym_quant && opad_all_zero; +#endif +} + +} // onednn_utils + +at::Tensor _qconv_prepack_onednn( + at::Tensor weight, // from CPU backend instead of QuantizedCPU + at::Tensor weight_scales, // Weight zero points must be 0 for onednn + double input_scale, + int64_t input_zero_point, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + std::optional> input_shape=std::nullopt); + +#define FP8E4M3_MAX 448.0 + +#define CACHE_ONEDNN_CONTEXT_FLAG "ONEDNN_CACHE_CONTEXT_UNSAFE" + +struct QlinearForwardParams { + dnnl::matmul primitive; + ideep::exec_args args; + ideep::tensor packed_weight; + ideep::tensor weight_scales; + std::optional src_scale; + std::optional src_zero_point; + std::optional dst_scale; + std::optional dst_zero_point; + std::optional bias; + ideep::tensor scratchpad; + + void init_args() { + args.insert({DNNL_ARG_WEIGHTS, packed_weight}); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad}); + if (bias.has_value()) { + args.insert({DNNL_ARG_BIAS, bias.value()}); + } + if (src_scale.has_value()) { + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scale.value()}); + } + if (dst_scale.has_value()) { + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scale.value()}); + } + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, weight_scales}); + if (src_zero_point.has_value()) { + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, src_zero_point.value()}); + } + if (dst_zero_point.has_value()) { + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zero_point.value()}); + } + } +}; + +#endif // #if AT_MKLDNN_ENABLED() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..f9f887f04e3c3f80e568f005d1d244f55af3ce60 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h @@ -0,0 +1,513 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_PYTORCH_QNNPACK +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include +inline int kPaddingChannels = 8; +struct QnnpackOperatorDeleter { + void operator()(pytorch_qnnp_operator_t op) { + pytorch_qnnp_delete_operator(op); + } +}; + +// PackedWeight struct for QNNPACK stores the original Weight and Bias as +// QNNPACK currently does not support an unpack function. +// For PyTorch Mobile, once the model is scripted and serialized we don't need +// to call unpack, so we can save some memory by checking for this case and free +// the original weights after packing. +// Input scale is set to null in pre-pack step. QNNPACK needs bias quantized +// with input scale which is available at runtime in pytorch. During runtime if +// input scale value changes then we requantize bias with the updated scale. For +// inference we expect the graph to be static so the input scale should not +// change across consecutive inference calls. +struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { + PackedLinearWeightsQnnp( + std::unique_ptr w, + at::Tensor orig_weight, + at::Tensor bias, + std::optional input_scale, + at::Tensor w_scales, + std::vector&& w_zps) + : w(std::move(w)), + orig_weight(std::move(orig_weight)), + bias_(at::native::mobile::allocate_padded_contiguous_if_needed( + bias, bias.suggest_memory_format())), + per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine), + input_scale(std::move(input_scale)), + w_scales(std::move(w_scales)), + w_zero_points(std::move(w_zps)), + q_scheme(this->orig_weight.qscheme()) { + weight_sizes = this->orig_weight.sizes().vec(); + } + + std::unique_ptr w; + at::Tensor orig_weight; + at::Tensor bias_; + bool per_channel_; + std::optional input_scale; + at::Tensor w_scales; + std::vector w_zero_points; + std::vector requantization_scales; + std::vector weight_sizes; + c10::QScheme q_scheme; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + bool per_channel() const { + return per_channel_; + } + + private: + std::mutex qnnp_mutex_; + +#ifdef USE_XNNPACK + xnnpack_operator xnnp_linear_op; + + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range); +}; + +template +struct PackedConvWeightsQnnp : public ConvPackedParamsBase { + PackedConvWeightsQnnp( + std::unique_ptr w, + at::Tensor orig_weight, + at::Tensor bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose, + std::optional input_scale, + std::vector kernel, + at::Tensor w_scale, + std::vector&& w_zps, + bool is_per_channel) + : w(std::move(w)), + orig_weight(std::move(orig_weight)), + bias(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose), + is_per_channel_(is_per_channel), + input_scale(input_scale), + kernel_(std::move(kernel)), + w_scales(std::move(w_scale)), + w_zero_points(std::move(w_zps)) { + const bool any_padding = std::any_of( + padding_.begin(), padding_.end(), [](const auto& e) { return e != 0; }); + const size_t kernel_size = + std::accumulate(kernel_.begin(), kernel_.end(), 1, std::multiplies<>()); + + const size_t group_input_channels = transpose + ? this->orig_weight.size(0) / groups + : this->orig_weight.size(1); + const size_t group_output_channels = transpose + ? this->orig_weight.size(1) + : this->orig_weight.size(0) / groups; + + const size_t kernel_depth = kSpatialDim == 3 ? kernel_[0] : 1; + const size_t kernel_height = kernel_[kSpatialDim - 2]; + const size_t kernel_width = kernel_[kSpatialDim - 1]; + + pytorch_qnnp_ukernel_type ukernel_type; + if (transpose_) { + ukernel_type = pytorch_qnnp_ukernel_type_conv; + } else { + ukernel_type = pytorch_qnnp_ukernel_type_none; + + const bool has_depthwise_dimensions = + (kSpatialDim == 2 && + ((kernel_height == 3 && kernel_width == 3) || + (kernel_height == 5 && kernel_width == 5))) || + (kSpatialDim == 3 && kernel_height == 3 && kernel_width == 3 && + kernel_depth == 3); + const bool has_depthwise_grouping = + group_input_channels == 1 && group_output_channels == 1 && groups > 1; + + if (has_depthwise_dimensions && has_depthwise_grouping) { + ukernel_type = pytorch_qnnp_ukernel_type_dwconv; + } else if ( + kernel_size == 1 && + std::all_of( + stride_.begin(), + stride_.end(), + [](const auto& e) { return e == 1; }) && + !any_padding) { + ukernel_type = group_input_channels >= SIZE_MAX + ? pytorch_qnnp_ukernel_type_xzp_gemm + : pytorch_qnnp_ukernel_type_gemm; + } else { + ukernel_type = pytorch_qnnp_ukernel_type_conv; + } + } + + if (is_per_channel && ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) { + TORCH_INTERNAL_ASSERT( + false, "Per channel quantized weights are not supported for XZP kernels"); + } + + pytorch_qnnp_operator_t convolution{nullptr}; + // Initially all the params are set to zero. + convolution = static_cast( + calloc(1, sizeof(struct pytorch_qnnp_operator))); + if (convolution == nullptr) { + TORCH_INTERNAL_ASSERT( + false, "failed to allocate %zu bytes for pytorch_qnnp_operator structure", + sizeof(struct pytorch_qnnp_operator)); + } + + convolution_op = + std::unique_ptr( + convolution); + + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + convolution->ukernel_type = ukernel_type; + convolution->groups = groups; + convolution->group_input_channels = group_input_channels; + convolution->group_output_channels = group_output_channels; + convolution->kernel_depth = kernel_depth; + convolution->kernel_height = kernel_height; + convolution->kernel_width = kernel_width; + convolution->stride_depth = kSpatialDim == 3 ? stride_[0] : 1; + convolution->stride_height = stride_[kSpatialDim - 2]; + convolution->stride_width = stride_[kSpatialDim - 1]; + convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1; + convolution->dilation_height = dilation_[kSpatialDim - 2]; + convolution->dilation_width = dilation_[kSpatialDim - 1]; + convolution->input_padding_height = padding_[kSpatialDim - 2]; + convolution->input_padding_width = padding_[kSpatialDim - 1]; + convolution->input_padding_depth = kSpatialDim == 3 ? padding_[0] : 0; + convolution->per_channel = is_per_channel_; + convolution->transpose = transpose_; + + const uint32_t kr = pytorch_qnnp_params.q8conv.kr; + const size_t k_stride = (group_input_channels + (kr - 1)) & -kr; + + size_t zero_size = sizeof(uint8_t) * k_stride; + size_t zero_offset = 0; + + if (transpose_) { + convolution->adjustment_width = output_padding_[1]; + convolution->adjustment_height = output_padding_[0]; + if (group_input_channels < 8) { + zero_size += 8; + zero_offset = 8; + } + } else { + zero_buffer_size = 0; + if (any_padding) { + zero_size = 0; + zero_offset = 0; + if (ukernel_type == pytorch_qnnp_ukernel_type_dwconv) { + const uint32_t cr = pytorch_qnnp_params.q8dw9.cr; + const size_t group_stride = (groups + (cr - 1)) & -cr; + if (groups >= 8) { + zero_size = sizeof(uint8_t) * group_stride; + zero_offset = 0; + } else { + zero_size = sizeof(uint8_t) * group_stride + 8; + zero_offset = sizeof(uint8_t) * 8; + } + } else if ( + ukernel_type == pytorch_qnnp_ukernel_type_conv || + ukernel_type == pytorch_qnnp_ukernel_type_gemm) { + if (group_input_channels >= 8) { + zero_size = sizeof(uint8_t) * k_stride; + zero_offset = 0; + } else { + zero_size = sizeof(uint8_t) * k_stride + 8; + zero_offset = 8; + } + } + } + } + + // NOLINTNEXTLINE(clang-analyzer-optin.portability.UnixAPI) + void* zero_buffer = malloc(zero_size); + if (zero_buffer == nullptr) { + pytorch_qnnp_delete_operator(convolution); + TORCH_INTERNAL_ASSERT( + false, "failed to allocate %zu bytes for zero padding", + zero_size); + } + // Need to set to input zero point + // memset(zero_buffer, input_zero_point, zero_size); + zero_buffer_size = zero_size; + convolution->zero_buffer = zero_buffer; + convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset); + } + + std::unique_ptr convolution_op; + #ifdef USE_XNNPACK + xnnpack_operator xnnp_convolution_op; + #endif // USE_XNNPACK + std::unique_ptr w; + at::Tensor orig_weight; + at::Tensor bias; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + bool transpose_; + bool is_per_channel_; + std::optional input_scale; + std::vector kernel_; + at::Tensor w_scales; + std::vector w_zero_points; + std::vector requantization_scales; + size_t zero_buffer_size; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range=false) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return transpose_; + } + + bool per_channel() const { + return is_per_channel_; + } + + private: + std::mutex qnnp_mutex_; + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + +#ifdef USE_XNNPACK + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK +}; + +enum class Activation : uint8_t { NONE = 0, RELU = 1 }; + +template +inline T QuantizeValue(float scale, int32_t zero_point, float value) { + const int32_t qmin = std::numeric_limits::min(); + const int32_t qmax = std::numeric_limits::max(); + auto r = zero_point + static_cast(std::nearbyint(value / scale)); + r = std::max(r, qmin); + r = std::min(r, qmax); + return static_cast(r); +} + +template +inline std::pair activationLimits( + float scale, + int32_t zero_point, + Activation Ac) { + switch (Ac) { + case Activation::NONE: + return {std::numeric_limits::min(), + std::numeric_limits::max()}; + case Activation::RELU: + return {QuantizeValue(scale, zero_point, 0.0), + std::numeric_limits::max()}; + default: +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif + } +} + +namespace at::native::qnnp_avgpool_helper { +Tensor qnnpack_avg_pool2d( + Tensor input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + std::optional divisor_override); +} // namespace at::native::qnnp_avgpool_helper + +namespace { +[[maybe_unused]] std::vector generate_requantization_scales( + const at::Tensor& weight_scales, + const float input_scale, + const float output_scale, + std::vector& requant_scales) { + // Since weight scale is allocated with padding + // weight_scales.numel() gives us padded num elements. + const auto num_output_channels_padded = weight_scales.numel(); + float *const weight_scales_data = weight_scales.data_ptr(); + if (static_cast(requant_scales.size()) < num_output_channels_padded) { + requant_scales.resize(num_output_channels_padded); + } + for (const auto i : c10::irange(num_output_channels_padded)) { + const auto inverse_output_scale = 1.f /output_scale; + requant_scales[i] = (weight_scales_data[i] * input_scale) * inverse_output_scale; + TORCH_CHECK( + (requant_scales[i] > 0.0f && std::isnormal(requant_scales[i])), + "failed to create op with requantization scale: ", + requant_scales[i], + ": requantization scale must be finite and positive"); + } + return requant_scales; +} + +[[maybe_unused]] std::pair, at::Tensor> +make_zero_points_and_scales_tensor( + const at::Tensor& weight_contig, + bool transpose = false, + uint32_t groups = 1) { + const int out_ch_idx = transpose ? 1 : 0; + const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1); + // Add 8 to account for buffering needed by QNNPACK. + const auto num_output_channels_padded = num_output_channels + kPaddingChannels; + const auto qtype = weight_contig.qscheme(); + std::vector weight_zp(num_output_channels_padded, 0); + // Adjust weight zero point, similar to weight data. + if (qtype == at::kPerTensorAffine) { + for (const auto i : c10::irange(num_output_channels)) { + weight_zp[i] = (uint8_t)(weight_contig.q_zero_point() + 128); + } + } else if (qtype == at::kPerChannelAffine) { + TORCH_CHECK( + weight_contig.q_per_channel_zero_points().scalar_type() == at::kLong, + "Per channel zero points dtype must be long int."); + const int64_t* per_channel_zero_points = + weight_contig.q_per_channel_zero_points().data_ptr(); + for (const auto i : c10::irange(num_output_channels)) { + weight_zp[i] = (uint8_t)(per_channel_zero_points[i] + 128); + } + } else { + TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme."); + } + at:: Tensor weight_scales = + at::empty( + {num_output_channels_padded}, + at::device(at::kCPU).dtype(at::kFloat)); + float *const weight_scales_data = weight_scales.data_ptr(); + if (qtype == at::kPerTensorAffine) { + for (const auto i : c10::irange(num_output_channels)) { + weight_scales_data[i] = weight_contig.q_scale(); + } + } else if (qtype == at::kPerChannelAffine) { + TORCH_CHECK( + weight_contig.q_per_channel_scales().scalar_type() == at::kDouble, + "Per channel scales dtype must be double."); + const double *const per_channel_scales = + weight_contig.q_per_channel_scales().data_ptr(); + for (const auto i : c10::irange(num_output_channels)) { + weight_scales_data[i] = static_cast(per_channel_scales[i]); + } + } else { + TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme."); + } + for (const auto i : c10::irange(num_output_channels, num_output_channels_padded)) { + weight_scales_data[i] = 1.f; + } + return {weight_zp, weight_scales}; +} +} // namespace + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..02a867a1f95687119fadf4e968ae12a3776d93e3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h @@ -0,0 +1,245 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +namespace quant_utils { +namespace { + float RawUint16ToFp16(unsigned short value) { + // Convert raw 16 bits half precision floating point number + // to single precision floating point number. + const unsigned short sign_bits = value >> 15; + const unsigned short exponent_bits = value >> 10 & 0x1f; + const unsigned short significand_bits = value & 0x3ff; + + const float sign = sign_bits ? -1 : 1; + const float significand = + 1 + significand_bits * 0.0009765625f; // 0.0009765625f = 0x1p-10 = 2^-10; + const float exponent = exponent_bits - 0xf; + + return sign * std::ldexp(significand, exponent); +} + +template +bool CheckAndSaturate(T max_val, T* element) { + if (*element > max_val) { + *element = max_val; + return true; + } + if (*element < -max_val) { + *element = -max_val; + return true; + } + return false; +} +} +using namespace std; +// A structure to hold quantization parameters 'scale' and 'zero_point'. +// The meaning of these values is as the constants in the quantization equation +// +// real_value = scale * (quantized_value - zero_point) +// +// In other words, 'zero_point' is the quantized value that corresponds +// to the real value 0, and 'scale' is the difference of real values +// corresponding to consecutive quantized values. +struct TensorQuantizationParams { + double scale; + std::int32_t zero_point; + int precision; +}; + +// Use fp16_min as the small scale cutoff because we don't want to use scales in +// fp16 subnormal range. This is to be consistent with Glow and FakeLowP +// implementation for NNPI. +constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; + +// Following implementation should be identical to fbgemm::ChooseQuantizationParams +inline TensorQuantizationParams ChooseQuantizationParams( + float min, + float max, + int32_t qmin, + int32_t qmax, + bool preserve_sparsity = false, + bool force_scale_power_of_two = false, + bool reduce_range = false) { + TORCH_CHECK( + min <= max, + "In ChooseQuantizationParams, min should be less than or equal to max"); + + if (reduce_range) { + qmin = qmin/2; + qmax = qmax/2; + } + if (min < 0 && max > 0 && preserve_sparsity) { + int symmetric_qmin = -((qmax - qmin) / 2 + 1); + int symmetric_qmax = (qmax - qmin) / 2; + double max_scale = + std::max(fabs(min / symmetric_qmin), fabs(max / symmetric_qmax)); + min = max_scale * symmetric_qmin; + max = max_scale * symmetric_qmax; + } + + // We extend the [min, max] interval to ensure that it contains 0. + // Otherwise, we would not meet the requirement that 0 be an exactly + // representable value. + min = std::min(min, 0.f); + max = std::max(max, 0.f); + + TORCH_CHECK( + qmin < qmax, + "In ChooseQuantizationParams, qmin should be less than qmax"); + + // Use double precision for intermediate computation but use single precision + // in final number to reflect the actual number used during quantization. + double scale = (static_cast(max) - min) / (qmax - qmin); + // If scale is 0 or too small so its reciprocal is infinity, we arbitrary + // adjust the scale to 0.1 . We want to avoid scale's reciprocal being + // infinity because some of fbgemm code pre-computes scale's reciprocal to do + // multiplication instead of division in the time critical part of code. + if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { + scale = 0.1; + } + TORCH_CHECK(scale > 0, "quantization scale should be > 0"); + + if (force_scale_power_of_two) { + if (scale < 1) { + scale = 1.0 / (1 << static_cast(floor(log(1.0 / scale) / log(2)))); + } else { + scale = 1 << static_cast(ceil(log(scale) / log(2))); + } + } + + // Cut off small scale + if (scale < SMALL_SCALE_THRESHOLD) { + float org_scale = scale; + scale = SMALL_SCALE_THRESHOLD; + // Adjust the min and max based on the new scale + if (min == 0.0f) { + max = SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else if (max == 0.0f) { + min = -SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else { + float amplifier = SMALL_SCALE_THRESHOLD / org_scale; + min *= amplifier; + max *= amplifier; + } + } + + // Zero-point computation. + // First the initial floating-point computation. The zero-point can be + // determined from solving an affine equation for any known pair + // (real value, corresponding quantized value). + // We know two such pairs: (rmin, qmin) and (rmax, qmax). + // The arithmetic error on the zero point computed from either pair + // will be roughly machine_epsilon * (sum of absolute values of terms) + // so we want to use the variant that adds the smaller terms. + double zero_point_from_min = qmin - min / scale; + double zero_point_from_max = qmax - max / scale; + double zero_point_from_min_error = + std::abs(qmin) - std::abs(min / scale); + double zero_point_from_max_error = + std::abs(qmax) - std::abs(max / scale); + double initial_zero_point = + zero_point_from_min_error < zero_point_from_max_error + ? zero_point_from_min + : zero_point_from_max; + + // for symmetric quantization (preserve_sparsity == true), we force zero_point + // to be a middle value between qmin and qmax. + // If either min or max is 0, then we just use 0 as zero_point. + if (min < 0 && max > 0 && preserve_sparsity) { + initial_zero_point = static_cast(qmin + qmax) / 2; + } + + // Now we need to nudge the zero point to be an integer + // (our zero points are integer, and this is motivated by the requirement + // to be able to represent the real value "0" exactly as a quantized value, + // which is required in multiple places, for example in Im2col with zero + // padding). + int32_t nudged_zero_point = 0; + if (initial_zero_point < qmin) { + nudged_zero_point = qmin; + } else if (initial_zero_point > qmax) { + nudged_zero_point = qmax; + } else { + nudged_zero_point = nearbyint(initial_zero_point); + } + + TensorQuantizationParams result; + result.scale = scale; + result.zero_point = nudged_zero_point; + return result; +} + +// This function helps to convert the Conv1D dimensions usable by the Conv2d op. +constexpr int64_t kConv1dSqueezeDim = 0; +[[maybe_unused]] static torch::List MakeArgForConv1d( + const torch::List& arg, + int64_t base_value) { + TORCH_CHECK(!arg.empty(), "Argument must have elements."); + torch::List result({arg.get(0), base_value}); + if (arg.size() == 1) { + result[1] = arg.get(0); + } else { + result[1] = arg.get(1); + } + result[kConv1dSqueezeDim] = base_value; + return result; +} + +// The range for using FP16 quantization of weights requires that the elements +// should be in the range of [5.96e-8, 65504]. If it is out of range, then the +// number will be saturated to max or min representable values by FP16. +inline void HandleWeightsSaturation(int64_t N, float* weight) { + const float kFp16Max = RawUint16ToFp16(0x7BFF); + bool found_out_of_range = false; + for (const auto i : c10::irange(N)) { + bool saturate = CheckAndSaturate(kFp16Max, weight + i); + if (saturate) { + found_out_of_range = true; + } + } + if (found_out_of_range) { + TORCH_WARN("FOUND weight out of range "); + } +} + +// Util function for quantizing bias. +inline at::Tensor QuantizeBias( + bool is_per_channel, + const at::Tensor& bias, + const at::Tensor& weight_contig, + double input_scale) { + at::Tensor qbias; + if (is_per_channel) { + auto bias_quant_scales = + weight_contig.q_per_channel_scales() * input_scale; + auto bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt); + qbias = at::native::quantize_per_channel( + bias, bias_quant_scales, bias_zp, 0, c10::kQInt32); + } else { + qbias = at::native::quantize_per_tensor( + bias, weight_contig.q_scale() * input_scale, 0, c10::kQInt32); + } + return qbias; +} + +} // namespace quant_utils + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h new file mode 100644 index 0000000000000000000000000000000000000000..ccbae78b5516ca6b516c281d054327e3d2bfac13 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h @@ -0,0 +1,287 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include + +namespace at::native { + +using qrelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qrelu_leaky_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, + const Scalar& /*negval_*/); +using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, GeluType /* approximate */); +using qsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, double output_scale, int64_t output_zero_point); +using qhardsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qclamp_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& min, + const Scalar& max, + at::Tensor& /*qy*/); +using qclamp_minmax_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& /*min or max*/, + at::Tensor& /*qy*/); +using qthreshold_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& threshold, + const Scalar& value, + at::Tensor& /*qy*/); +using qtanh_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qelu_fn = void(*)( + const at::Tensor& /*qx*/, + const Scalar& /*alpha*/, + const Scalar& /*scale*/, + const Scalar& /*input_scale*/, + at::Tensor& /*qy*/); +using qbinary_fn = + void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Tensor& /*other*/); +using qadd_scalar_fn = + void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Scalar& other /*other*/); +using qhardswish_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qdropout_fn = void(*)( + const at::Tensor& /*qx*/, + const Scalar& /*p*/, + bool training /*training*/, + at::Tensor& /*qy*/); +using qmaxpool_2d_fn = void (*)( + const Tensor& qx, + int64_t iC, // input/output channels + int64_t iH, + int64_t iW, // input sizes + int64_t oH, + int64_t oW, // output sizes + int64_t kH, + int64_t kW, // kernel size + int64_t sH, + int64_t sW, // strides + int64_t pH, + int64_t pW, // padding + int64_t dH, + int64_t dW, // dilation + Tensor& qy); +using qmaxpool_3d_fn = void (*)( + const Tensor& qx, + int64_t iC, // input/output channels + int64_t iT, + int64_t iH, + int64_t iW, // input sizes + int64_t oT, + int64_t oH, + int64_t oW, // output sizes + int64_t kT, + int64_t kH, + int64_t kW, // kernel size + int64_t sT, + int64_t sH, + int64_t sW, // strides + int64_t pT, + int64_t pH, + int64_t pW, // padding + int64_t dT, + int64_t dH, + int64_t dW, // dilation + Tensor& qy); +using qadaptive_avg_pool2d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t sizeB, + int64_t sizeC, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW, + int64_t istrideB, + int64_t istrideC, + int64_t istrideH, + int64_t istrideW); +using qadaptive_avg_pool3d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t sizeB, + int64_t sizeC, + int64_t isizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeD, + int64_t osizeH, + int64_t osizeW, + int64_t istrideB, + int64_t istrideC, + int64_t istrideD, + int64_t istrideH, + int64_t istrideW); +using qavg_pool2d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t nBatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool count_include_pad, + std::optional divisor_override); + +using qavg_pool3d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t nBatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t inputDepth, + int64_t outputWidth, + int64_t outputHeight, + int64_t outputDepth, + int kW, + int kH, + int kD, + int dW, + int dH, + int dD, + int padW, + int padH, + int padD, + bool count_include_pad, + std::optional divisor_override); + +using qupsample_bilinear2d_fn = void (*)( + Tensor& output, + const Tensor& input, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + int64_t nbatch, + int64_t channels, + bool align_corners, + std::optional scales_h, + std::optional scales_w); + +using qcat_nhwc_fn = Tensor (*)( + const MaterializedITensorListRef& qxs, + int64_t dim, + double scale, + int64_t zero_point); +using qtopk_fn = void(*)(Tensor&, Tensor&, const Tensor&, int64_t, int64_t, bool, bool); + +using qbatch_norm_fn = void(*)(int64_t, int64_t, int64_t, int64_t, int64_t, const Tensor&, const Tensor&, const Tensor&, Tensor&); + +using qnormalize_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + bool /* affine_per_channel */, + int /* num_channels */, + int /* num_groups */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */); + +using qmean_inner_dim_fn = void (*)( + const Tensor& /* X */, + OptionalIntArrayRef /* opt_dim */, + bool /* keepdim */, + std::optional /* opt_dtype */, + Tensor& /* Y */); + +using qstd_inner_dim_fn = void (*)( + const Tensor& /* X */, + OptionalIntArrayRef /* dim */, + const std::optional& /* correction */, + bool /* keepdim */, + Tensor& /* Y */); + +using qnormalize_nhwc_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + bool /* affine_per_channel */, + int /* num_channels */, + int /* num_groups */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */); + +using qprelu_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, + const Tensor& /*qw*/); + +using qbinary_eltwise_cpu_fn = void (*)( + Tensor& /*out*/, + const Tensor& /*qx*/, + double /*qx_scale*/, + int64_t /*qx_zero_point*/, + const Tensor& /*qy*/, + double /*qy_scale*/, + int64_t /*qy_zero_point*/, + double /*output_scale*/, + int64_t /*output_zero_point*/); + +using qbatch_norm_cpu_fn = void(*)( + int64_t /*N*/, + int64_t /*C*/, + int64_t /*H * W*/, + int64_t /*in_zero_point*/, + int64_t /*out_zero_point*/, + const Tensor& /*input*/, + const Tensor& /*a*/, + const Tensor& /*b*/, + Tensor& /*output*/); + +DECLARE_DISPATCH(qadaptive_avg_pool2d_fn, qadaptive_avg_pool2d_nhwc_stub) +DECLARE_DISPATCH(qadaptive_avg_pool3d_fn, qadaptive_avg_pool3d_ndhwc_stub) +DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_relu_stub) +DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_stub) +DECLARE_DISPATCH(qavg_pool2d_fn, qavg_pool2d_nhwc_stub) +DECLARE_DISPATCH(qavg_pool3d_fn, qavg_pool3d_nhwc_stub) +DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_relu_stub) +DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_stub) +DECLARE_DISPATCH(qbinary_fn, qadd_relu_stub) +DECLARE_DISPATCH(qbinary_fn, qadd_stub) +DECLARE_DISPATCH(qbinary_fn, qmul_relu_stub) +DECLARE_DISPATCH(qbinary_fn, qmul_stub) +DECLARE_DISPATCH(qcat_nhwc_fn, qcat_nhwc_stub) +DECLARE_DISPATCH(qcat_nhwc_fn, qcat_relu_nhwc_stub) +DECLARE_DISPATCH(qclamp_fn, qclamp_stub) +DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_min_stub) +DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_max_stub) +DECLARE_DISPATCH(qelu_fn, qelu_stub) +DECLARE_DISPATCH(qhardsigmoid_fn, qhardsigmoid_stub) +DECLARE_DISPATCH(qhardswish_fn, qhardswish_stub) +DECLARE_DISPATCH(qdropout_fn, qdropout_stub) +DECLARE_DISPATCH(qmaxpool_2d_fn, qmaxpool_2d_nhwc_stub) +DECLARE_DISPATCH(qmaxpool_3d_fn, qmaxpool_3d_nthwc_stub) +DECLARE_DISPATCH(qnormalize_fn, quantized_normalize_stub) +DECLARE_DISPATCH(qnormalize_nhwc_fn, quantized_groupnorm_nhwc_stub) +DECLARE_DISPATCH(qrelu_fn, qrelu_stub) +DECLARE_DISPATCH(qrelu_leaky_fn, qrelu_leaky_stub) +DECLARE_DISPATCH(qgelu_fn, qgelu_stub) +DECLARE_DISPATCH(qsigmoid_fn, qsigmoid_stub) +DECLARE_DISPATCH(qtanh_fn, qtanh_stub) +DECLARE_DISPATCH(qthreshold_fn, qthreshold_stub) +DECLARE_DISPATCH(qtopk_fn, qtopk_stub) +DECLARE_DISPATCH(qupsample_bilinear2d_fn, qupsample_bilinear2d_nhwc_stub) +DECLARE_DISPATCH(qmean_inner_dim_fn, qmean_inner_dim_stub) +DECLARE_DISPATCH(qstd_inner_dim_fn, qstd_inner_dim_stub) +DECLARE_DISPATCH(qprelu_fn, qprelu_stub) +DECLARE_DISPATCH(qbinary_eltwise_cpu_fn, qmul_tensor_cpu_stub) +DECLARE_DISPATCH(qbinary_eltwise_cpu_fn, qadd_tensor_cpu_stub) +DECLARE_DISPATCH(qbinary_eltwise_cpu_fn, qadd_relu_tensor_cpu_stub) +DECLARE_DISPATCH(qbatch_norm_cpu_fn, qbatch_norm_cpu_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..4baae7cf065a3b03d50ea8880af7016a329b9be1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_RUY_QMATMUL + +#include + +namespace at::native::ruy_utils { + +ruy::Context* get_ruy_context(); + +void quantize_multiplier(double scale, + int* multiplier_fixedpoint, + int* multiplier_exponent); + +} // namespace at::native::ruy_utils + +#endif // USE_RUY_QMATMUL + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..36fabe41f6727e178890c80e4df77a11e4daf447 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h @@ -0,0 +1,336 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_XNNPACK +#include + +#include +#include + +using xnnpack_operator = at::native::xnnpack::Operator; + +namespace at::native::xnnp_utils { + +/* + * Return shape in the same order as the memory format + * e.g. channels_last will return NHWC instead of NCHW + */ +std::vector get_mem_format_aware_shape(const at::Tensor& in); + +/* + * Input is always int8_t, output can be [int8_t, uint8_t]. + * input + offset = output + * int8_t + 128 = uint8_t + * int8_t + 0 = int8_t + */ +template +void q8_copy_int8_weight_and_add_offset(const at::Tensor& in, at::Tensor& out); + +template +Tensor convert_conv_weights_to_channel_last_tensor( + const at::Tensor& src, + int groups, + bool transpose); + +/* + * Series of create wrapper functions to call xnn_create_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_convolution2d_nhwc( + uint32_t pad_top, + uint32_t pad_right, + uint32_t pad_bottom, + uint32_t pad_left, + uint32_t kernel_h, + uint32_t kernel_w, + uint32_t stride_h, + uint32_t stride_w, + uint32_t dilation_h, + uint32_t dilation_w, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t ip_chan_stride, + size_t op_chan_stride, + int8_t izp, + float ip_scale, + int8_t kzp, + const float* k_scales, + const int8_t* kernel, + const int32_t* bias, + int8_t ozp, + float op_scale, + int8_t op_min, + int8_t op_max, + uint32_t flags, + xnn_operator_t* op, + bool per_channel, + bool transpose) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kzp, "XNNPACK Q[SC]8 conv kernels expects kernel zero point to be zero." + "But got: ", kzp); + + if (transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_create_deconvolution2d_nhwc_qs8( + pad_top, /* uint32_t output_padding_top */ + pad_right, /* uint32_t output_padding_right */ + pad_bottom, /* uint32_t output_padding_bottom */ + pad_left, /* uint32_t output_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t stride_height */ + stride_w, /* uint32_t stride_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels */ + ip_chan_stride, /* size_t input_pixel_stride */ + op_chan_stride, /* size_t output_pixel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* deconvolution_op_out */ + + } + + if (!per_channel) { + return xnn_create_convolution2d_nhwc_qs8( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* convolution_op_out */ + } else { /* per_channel */ + return xnn_create_convolution2d_nhwc_qs8_qc8w( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales, /* const float* kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* convolution_op_out */ + } +} + +/* + * Series of reshape wrapper functions to call xnn_reshape_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_reshape_convolution2d_nhwc( + xnn_operator_t op, + size_t batch, + size_t in_h, + size_t in_w, + pthreadpool_t pt_pool, + bool per_channel = false, + bool transpose = false, + uint32_t adj_h = 0, + uint32_t adj_w = 0) { + if(transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_reshape_deconvolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + adj_h, /* uint32_t adjustment_height */ + adj_w, /* uint32_t adjustment_width */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + + if (!per_channel) { + return xnn_reshape_convolution2d_nhwc_qs8( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + &workspace_size, /* size_t* workspace_size */ + &workspace_alignment, /* size_t* workspace_alignment */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } else { /* per_channel */ + return xnn_reshape_convolution2d_nhwc_qs8_qc8w( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + &workspace_size, /* size_t* workspace_size */ + &workspace_alignment, /* size_t* workspace_alignment */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } +} + + +/* + * Series of setup wrapper functions to call xnn_setup_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_convolution2d_nhwc( + xnn_operator_t op, + const int8_t* inp, + int8_t* outp, + bool per_channel = false, + bool transpose = false) { + if(transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + + return xnn_setup_deconvolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } + + if (!per_channel) { + return xnn_setup_convolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + nullptr, /* void workspace */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } else { /* per_channel */ + return xnn_setup_convolution2d_nhwc_qs8_qc8w( + op, /* xnn_operator_t deconvolution_op */ + nullptr, /* void workspace */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } +} + + +/* + * Series of wrapper functions to call xnn_create* and xnn_setup* + * functions for linear + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_fully_connected_nc( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + int8_t input_zero_point, + float input_scale, + int8_t kernel_zero_point, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* fully_connected_op_out) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kernel_zero_point, "XNNPACK QS8 linear kernel expects kernel zero point to be zero." + "But got: ", kernel_zero_point); + return xnn_create_fully_connected_nc_qs8( + input_channels, /* size_t input_channels */ + output_channels, /* size_t output_channels */ + input_stride, /* size_t input_stride */ + output_stride, /* size_t output_stride */ + input_zero_point, /* int8_t input_zero_point */ + input_scale, /* float input_scale */ + kernel_scale, /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + output_zero_point, /* int8_t output_zero_point */ + output_scale, /* float output_scale */ + output_min, /* int8_t output_min */ + output_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t */ + fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_reshape_fully_connected_nc( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool) { + return xnn_reshape_fully_connected_nc_qs8( + fully_connected_op, /* xnn_operator_t fully_connected_op */ + batch_size, /* size_t batch_size */ + threadpool); /* pthreadpool_t threadpool */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_fully_connected_nc( + xnn_operator_t fully_connected_op, + const int8_t* input, + int8_t* output) { + return xnn_setup_fully_connected_nc_qs8( + fully_connected_op, /* xnn_operator_t fully_connected_op */ + input, /* const int8_t* input */ + output /* int8_t* output */ + ); +} + +} // namespace at::native::xnnp_utils + +#endif // USE_XNNPACK + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h new file mode 100644 index 0000000000000000000000000000000000000000..72d32a7591d3cafd0a4a20322492ec0378a1857f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h @@ -0,0 +1,422 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#if !defined(__s390x__) && !defined(__powerpc__) +#include +#endif + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + + +#include + +/* Convolution prepacked parameters serialization. + * + * Version 1 + * + * - Fields: + * 1. weight + * 2. bias + * 3. stride x kSpatialDim + * 4. padding x kSpatialDim + * 5. dilation x kSpatialDim + * 6. groups + * + * Version 2 + * + * - Fields: + * 0. version (string) + * 1. list of non-optional tensors + * 0: packed parameters (int16_t) + * - kSpatialDim + * - stride x kSpatialDim + * - padding x kSpatialDim + * - dilation x kSpatialDim + * - output_padding x kSpatialDim + * - groups + * - transpose (0 or 1) + * 1: weight + * 2. list of optional tensors + * 0: bias + * + * Version 3 + * + * - Fields: + * 0. version (int64_t) + * 1. list of int64_t configuration values + * - kSpatialDim + * - stride x kSpatialDim + * - padding x kSpatialDim + * - dilation x kSpatialDim + * - output_padding x kSpatialDim + * - groups + * - flags (bitmask) + * - (1 << 0) transpose (1 = yes) + * 2. list of optional tensors + * 0: None (helps with type inference) + * 1: weight (this must be present) + * 2: bias + */ + +using ConvParamsSerializationTypeV2 = std::tuple< + // version, for versions 2 and up + std::string, + // non-optional tensors + std::vector, + // optional tensors + std::vector>>; + +using ConvParamsSerializationTypeV3 = std::tuple< + // version, int for versions 3 and up + int64_t, + // configuration values + std::vector, + // optional tensors + std::vector>>; + +// Parses any historical conv packed params format into +// the current format. +template +ConvParamsSerializationTypeV3 parse_conv_serialized_state(const c10::IValue& v) { + + // determine the version based on IValue contents + int version = -1; + if (v.isTuple()) { + const auto& elements = v.toTupleRef().elements(); + if (!elements.empty()) { + auto firstElement = elements[0]; + if (firstElement.isTensor()) { + version = 1; + } else if (firstElement.isString()) { + const std::string& version_str = firstElement.toStringRef(); + // note: not parsing the string to automatically handle bad + // inputs + if (version_str == "2") { + version = 2; + } + } else if (firstElement.isInt()) { + auto raw_version = firstElement.toInt(); + if (raw_version == 3) { + version = 3; + } + } + } + } + TORCH_INTERNAL_ASSERT(version != -1, "Unable to parse serialization version"); + + if (version == 1) { + // version 1 - convert to version 3 manually + + const auto& elements = v.toTupleRef().elements(); + + at::Tensor weight = elements[0].toTensor(); + std::optional bias = elements[1].toOptional(); + torch::List stride_x_kSpatialDim = elements[2].toTensorList(); + torch::List padding_x_kSpatialDim = elements[3].toTensorList(); + torch::List dilation_x_kSpatialDim = elements[4].toTensorList(); + at::Tensor groups = elements[5].toTensor(); + + std::vector config_vals; + config_vals.reserve( + stride_x_kSpatialDim.size() + padding_x_kSpatialDim.size() + + dilation_x_kSpatialDim.size() + kSpatialDim + 3); + config_vals.push_back(kSpatialDim); + for (const auto i : c10::irange(stride_x_kSpatialDim.size())) { + auto const & stride = stride_x_kSpatialDim.get(i); + config_vals.push_back(stride[0].item()); + } + for (const auto i : c10::irange(padding_x_kSpatialDim.size())) { + auto const &padding = padding_x_kSpatialDim.get(i); + config_vals.push_back(padding[0].item()); + } + for (const auto i : c10::irange(dilation_x_kSpatialDim.size())) { + auto const &dilation = dilation_x_kSpatialDim.get(i); + config_vals.push_back(dilation[0].item()); + } + // output_padding does not exist in v1, so we fill in a default value + for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) { + config_vals.push_back(0); + } + config_vals.push_back(groups[0].item()); + // transpose does not exist in v1, so we fill in a default value + config_vals.push_back(0); + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); + } else if (version == 2) { + // version 2 + const auto& elements = v.toTupleRef().elements(); + std::vector non_optional = elements[1].toTensorList().vec(); + std::vector> optional; + + if (elements[2].isTensorList()) { + for (const auto& elem : elements[2].toTensorList()) { + optional.emplace_back(static_cast(elem)); + } + } else { + for (const auto& elem : elements[2].toList()) { + optional.emplace_back(static_cast(elem).toOptional()); + } + } + // create default optional value for bias + if (optional.empty()) { + optional.emplace_back(); + } + + auto config_a = non_optional[0].accessor(); + std::vector config_vals; + config_vals.reserve(config_a.size(0)); + for (const auto i : c10::irange(config_a.size(0))) { + config_vals.emplace_back(config_a[i]); + } + + auto weight = non_optional[1]; + auto bias = optional[0]; + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); + } else if (version == 3) { + return v.to(); + } else { + TORCH_INTERNAL_ASSERT(false, "Unexpected serialized qconv version: ", + version); + } +} + +#define QCONV_SERIALIZATION_VERSION 2 + +#if QCONV_SERIALIZATION_VERSION == 2 +using ConvParamsSerializationType = ConvParamsSerializationTypeV2; + +template +ConvParamsSerializationTypeV2 serialize_conv( + const c10::intrusive_ptr>& params) { + + std::string version = "2"; + std::vector non_optional; + std::vector> optional; + + // create a packed int8_t tensor for conv params + std::vector params_vec; + params_vec.push_back(kSpatialDim); + auto stride = params->stride().vec(); + params_vec.insert(params_vec.end(), stride.begin(), stride.end()); + auto padding = params->padding().vec(); + params_vec.insert(params_vec.end(), padding.begin(), padding.end()); + auto dilation = params->dilation().vec(); + params_vec.insert(params_vec.end(), dilation.begin(), dilation.end()); + auto output_padding = params->output_padding().vec(); + params_vec.insert(params_vec.end(), output_padding.begin(), + output_padding.end()); + params_vec.push_back(params->groups()); + params_vec.push_back(params->transpose()); + int64_t vec_size = params_vec.size(); + at::Tensor params_tensor = at::from_blob( + params_vec.data(), {vec_size}, + at::TensorOptions().dtype(at::kShort)) + // clone to retain ownership of the data + .clone(); + + auto [weight, bias] = params->unpack(); + + non_optional.emplace_back(std::move(params_tensor)); + non_optional.emplace_back(std::move(weight)); + optional.emplace_back(std::move(bias)); + + return std::tie(version, non_optional, optional); +} + +#elif QCONV_SERIALIZATION_VERSION == 3 +using ConvParamsSerializationType = ConvParamsSerializationTypeV3; + +template +ConvParamsSerializationTypeV3 serialize_conv( + const c10::intrusive_ptr>& params) { + std::vector config_vals; + config_vals.push_back(kSpatialDim); + auto stride = params->stride().vec(); + config_vals.insert(config_vals.end(), stride.begin(), stride.end()); + auto padding = params->padding().vec(); + config_vals.insert(config_vals.end(), padding.begin(), padding.end()); + auto dilation = params->dilation().vec(); + config_vals.insert(config_vals.end(), dilation.begin(), dilation.end()); + auto output_padding = params->output_padding().vec(); + config_vals.insert(config_vals.end(), output_padding.begin(), + output_padding.end()); + config_vals.push_back(params->groups()); + config_vals.push_back(params->transpose()); + + auto [weight, bias] = params->unpack(); + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); +} + +#else +#error "Invalid qconv serialization version." +#endif + +template +c10::intrusive_ptr> deserialize_conv( + ConvParamsSerializationTypeV3 state) { + auto & [version, config_vals, tensors] = state; + TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version); + + TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors", tensors.size()); + auto & weight = tensors[1]; + auto & bias [[maybe_unused]] = tensors[2]; + TORCH_INTERNAL_ASSERT(weight.has_value(), "Weight should always be present in serialized qconv."); + + torch::List stride, padding, output_padding, dilation; + // skip kSpatialDim + int idx = 1; + for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) { + stride.emplace_back(config_vals.at(idx)); + idx++; + } + for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) { + padding.emplace_back(config_vals.at(idx)); + idx++; + } + for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) { + dilation.emplace_back(config_vals.at(idx)); + idx++; + } + for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) { + TORCH_INTERNAL_ASSERT( + idx < static_cast(config_vals.size()), + "Unexpected index = ", + idx, + " for config_vals of size ", + config_vals.size()); + output_padding.emplace_back(config_vals.at(idx)); + idx++; + } + int64_t groups [[maybe_unused]] = config_vals.at(idx); + idx++; + int64_t flags [[maybe_unused]] = config_vals.at(idx); + idx++; + TORCH_INTERNAL_ASSERT(idx == static_cast(config_vals.size()), + "Unexpected length of config_vals, expected ", + idx, + " got ", + config_vals.size()); + + bool transpose [[maybe_unused]] = flags & (1 << 0); + + int64_t other_flags = flags & ~(1 << 0); + TORCH_INTERNAL_ASSERT(other_flags == 0, "Unexpected flags set in ", flags, "."); + + auto& ctx = at::globalContext(); + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::X86) { +#if AT_MKLDNN_ENABLED() + bool use_onednn = onednn_utils::should_use_onednn_quant( + weight.value(), transpose, groups, output_padding); + if (use_onednn) { + return PackedConvWeightsOnednn::prepack( + std::move(weight.value()), + std::move(bias), + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif + return PackedConvWeight::prepack( + std::move(weight.value()), + std::move(bias), + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } // x86 +#endif + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::FBGEMM) { + return PackedConvWeight::prepack( + std::move(weight.value()), + std::move(bias), + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // USE_FBGEMM +#ifdef USE_PYTORCH_QNNPACK + if (ctx.qEngine() == at::QEngine::QNNPACK) { + TORCH_CHECK( + kSpatialDim == 2, + "prepack/__setstate__: QNNPACK only supports Conv2d " + "now."); + return PackedConvWeightsQnnp::prepack( + std::move(weight.value()), + std::move(bias), + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn::prepack( + std::move(weight.value()), + std::move(bias), + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // AT_MKLDNN_ENABLED() +TORCH_CHECK( + false, + "Didn't find engine for when deserializing ConvPackedParams: ", + toString(ctx.qEngine())); +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..5130c769ffbfd2cd396cd8e41195a722150985d5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h @@ -0,0 +1,415 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#ifdef USE_FBGEMM +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") +#include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Winconsistent-missing-destructor-override") +#include +C10_DIAGNOSTIC_POP() +#include +C10_DIAGNOSTIC_POP() + +// The struct for the packed weight matrix (PackBMatrix) and the corresponding +// column offsets used for the fully connect layer, which are both prepared in +// the prepacking step to save the computations in the inference. Note the +// column offsets include the sum of the B columns as well as the scalar term +// B_zero_point * K, whereas the row offsets created by +// PackAWithQuantRowOffset/PackAWithIm2Col/PackAWithRowOffset are only the sum +// of the A rows. The column offsets are needed for the asymmetric quantization +// (affine quantization) of input matrix. +// Note that in JIT mode we can think of a way to fuse col_offsets with bias. +struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { + PackedLinearWeight( + std::unique_ptr> w, + std::optional bias, + std::vector col_offsets, + std::vector w_scale, + std::vector w_zp, + c10::QScheme q_scheme) + : w(std::move(w)), + bias_(std::move(bias)), + col_offsets(std::move(col_offsets)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + q_scheme(std::move(q_scheme)) {} + std::unique_ptr> w; + std::optional bias_; + std::vector col_offsets; + std::vector w_scale; + std::vector w_zp; + c10::QScheme q_scheme; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor& apply_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) override; + + at::Tensor& apply_relu_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) override; + + at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) override; + + at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) + override; + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + private: + template + at::Tensor& apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output); + + template + at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32_impl( + const at::Tensor& input, + double input_scale, + int64_t input_zero_point); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false); +}; + +struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase { + PackedLinearWeightFp16( + std::unique_ptr w, + std::optional bias) + : w(std::move(w)), bias_(std::move(bias)) {} + + std::unique_ptr w; + std::optional bias_; + + at::Tensor apply( + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { + TORCH_INTERNAL_ASSERT(false); + } + at::Tensor apply_relu( + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { + TORCH_INTERNAL_ASSERT(false); + } + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) + override; + + at::Tensor& apply_dynamic_out( + const at::Tensor& input, + at::Tensor& output, + bool reduce_range = false) override; + at::Tensor& apply_dynamic_relu_out( + const at::Tensor& input, + at::Tensor& output, + bool reduce_range = false) override; + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + void set_bias(std::optional bias) override; + + private: + template + at::Tensor& apply_dynamic_impl(const at::Tensor& input, at::Tensor& output); +}; + +template +struct TORCH_API PackedConvWeight : public ConvPackedParamsBase { + PackedConvWeight( + std::unique_ptr> w, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + uint8_t transpose, + std::vector col_offsets, + std::vector kernel, + std::vector w_scale, + std::vector w_zp, + c10::QScheme q_scheme) + : w(std::move(w)), + bias(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose), + col_offsets(std::move(col_offsets)), + kernel(std::move(kernel)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + q_scheme(q_scheme) {} + + std::unique_ptr> w; + std::optional bias; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + uint8_t transpose_; + std::vector col_offsets; + std::vector kernel; + std::vector w_scale; + std::vector w_zp; + c10::QScheme q_scheme; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + const float* GetBiasData(at::Tensor* bias); + + void GetQuantizationParams( + float act_scale, + float out_scale, + std::vector* output_multiplier_float, + std::vector* act_times_w_scale); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return (bool)transpose_; + } + + private: + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +}; + +// PackWeight: Convert the weight from uint8 to int8. +inline void convert_uint8_int8( + int len, + const uint8_t* src_uint8, + int8_t* dst_int8) { + for (const auto i : c10::irange(len)) { + dst_int8[i] = static_cast(static_cast(src_uint8[i]) - 128); + } +} + +// UnpackWeight: Convert the weight from int8 to uint8. +inline void convert_int8_uint8( + int len, + const int8_t* src_int8, + uint8_t* dst_uint8) { + for (const auto i : c10::irange(len)) { + dst_uint8[i] = + static_cast(static_cast(src_int8[i]) + 128); + } +} + +namespace at::native::fbgemm_utils { + +template +fbgemm::conv_param_t MakeFbgemmConvParam( + int N, + int C, + int M, + const std::vector& image_shape, + int groups, + const std::vector& kernels, + const std::vector& strides, + const std::vector& pads, + const std::vector& dilations, + const std::vector& output_padding = std::vector(kSpatialDim, 0), + bool transposed = false); + +// TODO: Remove functions below when ChannelsLast3d is ready. +Tensor MakeStridedQTensorCPU( + const IntArrayRef& sizes, + const IntArrayRef& strides, + const TensorOptions& options, + QuantizerPtr quantizer); + +Tensor MakeEmptyAffineQuantizedChannelsLast3dTensor( + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + const TensorOptions& options, + double scale, + int64_t zero_point); + +Tensor MakeEmptyPerChannelAffineQuantizedChannelsLast3dTensor( + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + const TensorOptions& options, + const Tensor& scales, + const Tensor& zero_points); + +Tensor ConvertToChannelsLast3dTensor(const Tensor& src); + +template +Tensor TransposeConvTensorUnpackConversion(const Tensor& src, int groups); + +template +Tensor ConvertConvWeightsToChannelLastTensor( + const at::Tensor& src, + int groups, + bool transpose); +} // at::native::namespace fbgemm_utils + +#endif // USE_FBGEMM + +struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase { + PackedEmbeddingBagWeight( + at::Tensor packed_w, + std::vector w_scale, + std::vector w_zp, + int64_t bit_rate, + c10::QScheme q_scheme, + int64_t version) + : packed_w(std::move(packed_w)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + bit_rate_(bit_rate), + q_scheme(q_scheme), + version_(version) { + if (!this->packed_w.is_contiguous()) { + this->packed_w = this->packed_w.contiguous(); + } + } + + at::Tensor packed_w; + std::vector w_scale; + std::vector w_zp; + int64_t bit_rate_; + c10::QScheme q_scheme; + int64_t version_; + + at::Tensor unpack() override; + static c10::intrusive_ptr prepack( + const at::Tensor& weight); + + int64_t bit_rate() const override { + return bit_rate_; + } + + int64_t version() const override { + return version_; + } + + at::Tensor embeddingbag_byte( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) override; + + at::Tensor embeddingbag_4bit( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) override; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h new file mode 100644 index 0000000000000000000000000000000000000000..b6d62a7a085864da1194565fb8a34ccf03632342 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h @@ -0,0 +1,16 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_PYTORCH_QNNPACK + +namespace at::native { + +void initQNNPACK(); + +} // namespace at::native + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qconv.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qconv.h new file mode 100644 index 0000000000000000000000000000000000000000..bb480f4868ea9ee7517e7eb2d98daad9f239da48 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qconv.h @@ -0,0 +1,105 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +namespace native { + +class QConvoneDNN final { + public: + + C10_API static at::Tensor run_pointwise( + at::Tensor act, // contains quantized values but not QTensor + double act_scale, + int64_t act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_tensor( + at::Tensor act, // contains quantized values but not QTensor + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_binary( + at::Tensor act, // contains quantized values but not QTensor + double act_scale, + int64_t act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, // contains quantized values but not QTensor + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + C10_API static at::Tensor run_pointwise_binary_tensor( + at::Tensor act, // contains quantized values but not QTensor + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, // contains quantized values but not QTensor + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h new file mode 100644 index 0000000000000000000000000000000000000000..8f474b79d46f4ad2791cb28f34ca3530a363a40e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { +Tensor& embedding_bag_byte_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const std::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset); + +Tensor& embedding_bag_4bit_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const std::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset); + +Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weight); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h new file mode 100644 index 0000000000000000000000000000000000000000..f581236788a75dd17bfc6b9bec831571c501fbeb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h @@ -0,0 +1,20 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +Tensor& qembeddingbag_byte_prepack_out( + Tensor& output, + const Tensor& weight, + const std::optional& rowwise_min_max_opt = std::nullopt); + +Tensor qembeddingbag_byte_prepack(const Tensor& weight); + +Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qlinear.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qlinear.h new file mode 100644 index 0000000000000000000000000000000000000000..51d3079767fe6ec32d05194c2576d158bc07c3c0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qlinear.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +class QLinearOnednn final { + public: + C10_API static Tensor run_pointwise_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + c10::List> post_op_args, + std::string_view post_op_algorithm); + +C10_API static Tensor run_pointwise_binary_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, // extra input for binary post-op + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, // e.g. "none", "sum", "add" + double binary_alpha, + std::string_view unary_post_op, // e.g. "none", "relu" + c10::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); +}; + +C10_API Tensor _weight_int4pack_mm_cpu_tensor( + const Tensor& A, + const Tensor& B, + const Tensor& qGroupSize, + const Tensor& qScaleAndZeros); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cudnn/utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cudnn/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fe55d839b2678abb11bd83054279b37c83234e0e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cudnn/utils.h @@ -0,0 +1,320 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +/* +This file contains some of the auxiliary functions used by both Conv.cpp & Linear.cpp (introduced in a later PR) +*/ + +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include +#include +#include +#include +#include +#include + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wsuggest-override") +#include +C10_DIAGNOSTIC_POP() + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +struct PackedLinearWeightCudnn : public LinearPackedParamsBase { + PackedLinearWeightCudnn( + at::Tensor orig_weight, + std::optional bias, + c10::QScheme q_scheme) + : orig_weight(std::move(orig_weight)), + bias_(std::move(bias)), + q_scheme(std::move(q_scheme)) {} + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) override { + TORCH_CHECK(false, "apply_dynamic is not implemented for this packed parameter type"); + } + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) override { + TORCH_CHECK(false, "apply_dynamic_relu is not implemented for this packed parameter type"); + } + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + private: + at::Tensor orig_weight; + std::optional bias_; + c10::QScheme q_scheme; + + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + + template + void apply_impl_helper( + const at::Tensor& quantized_output, + const at::Tensor& input, + double output_scale); +}; + +template +struct PackedConvWeightCudnn : public ConvPackedParamsBase { + PackedConvWeightCudnn( + at::Tensor orig_weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose, + c10::QScheme q_scheme, + int64_t output_channels) + : maybe_padded_weight_(std::move(orig_weight)), + bias_(std::move(bias)), + stride_(stride), + padding_(padding), + output_padding_(output_padding), + dilation_(dilation), + groups_(groups), + transpose_(transpose), + q_scheme_(q_scheme), + num_unpadded_output_channels_(output_channels) {} // output channels needs to be stored when we have to pad this dimension + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override { + TORCH_CHECK(false, "apply_dynamic is currently not reported"); + } + + at::Tensor apply_dynamic_relu( + const at::Tensor& input, + bool reduce_range) { + TORCH_CHECK(false, "apply_dynamic_relu is currently not reported"); + } + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + const float* GetBiasData(at::Tensor* bias); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return transpose_; + } + + private: + // cudnn v8.4.0 expects conv2d's int8 weight tensor's input and output channels to be a multiple of 4. if it is not + // we need to explicitly pad it to a multiple of 4 ourselves as cudnn does not currently support padding, hence the naming + // convention "maybe"_padded_weight. + // TODO: when and if cudnn enables padding in their operators, we can remove padding on our end and rename this to orig_weight_ + at::Tensor maybe_padded_weight_; + std::optional bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + bool transpose_; + c10::QScheme q_scheme_; + int64_t num_unpadded_output_channels_; + + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + + template + void apply_impl_helper( + const at::Tensor& quantized_output, + const at::Tensor& input, + double output_scale); +}; + +namespace cudnn_utils { + +// TODO: we can remove this function when cuDNN enables pass by value support for +// pointwise multiplication operations. the only reason why we need this right now is +// we use broadcasting scalar multiplication in conv, linear, and add ops, and cuDNN requires +// the scalar to be a scalar tensor with the same number of dimensions (num_dim) as the tensor we're multiplying to +inline at::Tensor getRequantMultiplierTensor(double requant_multiplier, uint8_t num_dim) { + at::SmallVector requantize_multiplier_tensor_size(num_dim, 1); + at::Tensor requantize_multiplier_tensor = at::empty(requantize_multiplier_tensor_size, at::device(at::kCUDA).dtype(at::kFloat)); + requantize_multiplier_tensor.fill_(requant_multiplier); + return requantize_multiplier_tensor; +} + +inline uint8_t getAlignment(const at::Tensor &t) { + // alignment are in bytes + uint8_t alignment = 1; + uintptr_t address = reinterpret_cast(t.data_ptr()); + for (; alignment < 16; alignment *= 2) { + if (address % (alignment * 2)) { + return alignment; + } + } + return alignment; +} + +// For the two getTensorDescriptor functions, there is a is_virtual parameter. This parameter is used to set the cudnn +// tensor as virtual or not. Setting the tensor as virtual is expected to have some performance benefits as the cudnn +// backend cudnn will no longer directly save to the tensor, allowing us to omit this tensor from the variant pack. +// See third_party/cudnn_frontend/samples/fusion_sample.cpp for other examples + +inline cudnn_frontend::Tensor getTensorDescriptor(const at::Tensor &t, int64_t id, uint8_t alignment, bool is_virtual = false) { + auto shape = t.sizes(); + auto strides = t.strides(); + if (is_virtual) { + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setVirtual() + .setDataType(at::native::getCudnnDataType(t)) + .build(); + } + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(at::native::getCudnnDataType(t)) + .build(); +} + +inline cudnn_frontend::Tensor getTensorDescriptor(const c10::IntArrayRef& shape, const c10::IntArrayRef& strides, cudnnDataType_t cudnn_dtype, int64_t id, uint8_t alignment, bool is_virtual = false) { + if (is_virtual) { + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setVirtual() + .setDataType(cudnn_dtype) + .build(); + } + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(cudnn_dtype) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype, we can derive +// the operator dtype based on input dtype +inline cudnn_frontend::PointWiseDesc_v8 getPointWiseMulDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_MUL) + .setMathPrecision(dataType) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype, we can derive +// the operator dtype based on input dtype +inline cudnn_frontend::PointWiseDesc_v8 getPointWiseAddDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_ADD) + .setMathPrecision(dataType) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype, we can derive +// the operator dtype based on input dtype +inline cudnn_frontend::PointWiseDesc_v8 getPointWiseReluDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_RELU_FWD) + .setMathPrecision(dataType) + .build(); +} + + +inline void filterEngineConfigs( + cudnn_frontend::EngineConfigList &from, + cudnn_frontend::EngineConfigList &to, + bool deterministic, bool allow_tf32, c10::ScalarType scalar_type) +{ + auto filter = [=](cudnnBackendDescriptor_t c) { + if (deterministic) { + if (cudnn_frontend::hasNumericalNote(c)) return true; + } + if (scalar_type == at::kFloat || scalar_type == at::kChar || !allow_tf32) { + if (cudnn_frontend::hasNumericalNote(c)) return true; + if (cudnn_frontend::hasNumericalNote(c)) return true; + } + return false; + }; + cudnn_frontend::filter(from, to, filter); +} + +} // cudnn_utils + +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/library.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/library.h new file mode 100644 index 0000000000000000000000000000000000000000..892bc13e4ec0f2b817279739c089eef14b420a8f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/quantized/library.h @@ -0,0 +1,13 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +TORCH_API int register_linear_params(); +int register_embedding_params(); + +template TORCH_API int register_conv_params(); + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/attention.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/attention.h new file mode 100644 index 0000000000000000000000000000000000000000..01af728fa9edf7729a8973d5eef435473be1268c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/attention.h @@ -0,0 +1,75 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include + +namespace at::native { + +using fused_sdp_choice_fn = int64_t (*)(const Tensor& query_, const Tensor& key, const Tensor& value, + const std::optional& attn_mask_, double dropout_p, bool is_causal, std::optional scale, bool enable_gqa); + +DECLARE_DISPATCH(fused_sdp_choice_fn, _fused_sdp_choice_stub) + +TORCH_API Tensor bmm_nt(const Tensor& a, const Tensor& b); +TORCH_API Tensor masked_softmax( + Tensor& attn_scores, + std::optional attn_mask, + const Tensor& query, + std::optional mask_type = {}); + +using transform_bias_rescale_qkv_fn = void(*)( + at::ScalarType type, + void* _q_k_v, + const void* _qkv, + const void* _qkv_bias, + int64_t B, + int64_t T, + int64_t D, + int64_t num_head); + +DECLARE_DISPATCH(transform_bias_rescale_qkv_fn, transform_bias_rescale_qkv_stub) + +TORCH_API Tensor transform0213_gemm_nt_bias( + const Tensor& a, + const Tensor& b, + const Tensor& c, + const Tensor& query); + +TORCH_API Tensor bmm_nn(Tensor& out, const Tensor& a, const Tensor& b); + +TORCH_API void debug_assert_shape(int line, const Tensor& t, c10::IntArrayRef shape); + +TORCH_API Tensor qkv_projection( + const Tensor& query, + const Tensor& key, + const Tensor& value, + const int64_t embed_dim, + const Tensor& qkv_weight); + +using flash_attention_fn = void (*)( + const Tensor& output, const Tensor& logsumexp, + const Tensor& query, const Tensor& key, const Tensor& value, + double dropout_p, bool is_causal, + std::optional attn_mask, + std::optional scale); + +using flash_attention_backward_fn = void (*)( + const Tensor& grad_q, const Tensor& grad_k, + const Tensor& grad_v, const Tensor& grad_out, + const Tensor& query, const Tensor& key, + const Tensor& value, const Tensor& out, const Tensor& logsumexp, + double dropout_p, bool is_causal, + std::optional attn_mask, + std::optional scale); + +DECLARE_DISPATCH(flash_attention_fn, flash_attention_kernel) +DECLARE_DISPATCH(flash_attention_backward_fn, flash_attention_backward_kernel) + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/flash_api.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/flash_api.h new file mode 100644 index 0000000000000000000000000000000000000000..84dc545da12a5e298057e73cf0fd339cebc6c88a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/flash_api.h @@ -0,0 +1,101 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#include +#include +#include + +namespace FLASH_NAMESPACE { + +TORCH_API +std::tuple +mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &out_, // batch_size x seqlen_q x num_heads x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, + const float softmax_scale, + bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool return_softmax, + std::optional gen_); + +std::tuple +mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + std::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. + std::optional &block_table_, // batch_size x max_num_blocks_per_seq + std::optional &alibi_slopes_, // num_heads or b x num_heads + int max_seqlen_q, + const int max_seqlen_k, + const float p_dropout, + const float softmax_scale, + const bool zero_tensors, + bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool return_softmax, + std::optional gen_); + + +std::tuple +mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x seqlen_q + std::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + std::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, // probability to drop + const float softmax_scale, + const bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool deterministic, + const at::Tensor philox_seed, + const at::Tensor philox_offset); + +std::tuple +mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads, x head_size + const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &out, // total_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x s softmax logsumexp + std::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + std::optional &dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + std::optional &alibi_slopes_, // num_heads or b x num_heads + const int max_seqlen_q, + const int max_seqlen_k, // max sequence length to choose the kernel + const float p_dropout, // probability to drop + const float softmax_scale, + const bool zero_tensors, + const bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool deterministic, + const at::Tensor philox_seed, + const at::Tensor philox_offset); + +} // namespace FLASH_NAMESPACE + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/static_switch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/static_switch.h new file mode 100644 index 0000000000000000000000000000000000000000..7d9c91b023a6f06cb54ae8b381c40eeda456f121 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/static_switch.h @@ -0,0 +1,112 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Inspired by +// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h +// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h + +#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... - code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` + +#define BOOL_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + if (COND) { \ + constexpr static bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() + +#ifdef FLASHATTENTION_DISABLE_DROPOUT + #define DROPOUT_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + }() +#else + #define DROPOUT_SWITCH BOOL_SWITCH +#endif + +#ifdef FLASHATTENTION_DISABLE_ALIBI + #define ALIBI_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + }() +#else + #define ALIBI_SWITCH BOOL_SWITCH +#endif + +#ifdef FLASHATTENTION_DISABLE_UNEVEN_K + #define EVENK_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + constexpr static bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + }() +#else + #define EVENK_SWITCH BOOL_SWITCH +#endif + +#ifdef FLASHATTENTION_DISABLE_LOCAL + #define LOCAL_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + }() +#else + #define LOCAL_SWITCH BOOL_SWITCH +#endif + +#define FP16_SWITCH(COND, ...) \ + [&] { \ + if (COND) { \ + using elem_type = cutlass::half_t; \ + return __VA_ARGS__(); \ + } else { \ + using elem_type = cutlass::bfloat16_t; \ + return __VA_ARGS__(); \ + } \ + }() + +#define HEADDIM_SWITCH(HEADDIM, ...) \ + [&] { \ + if (HEADDIM <= 32) { \ + constexpr static int kHeadDim = 32; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 64) { \ + constexpr static int kHeadDim = 64; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 96) { \ + constexpr static int kHeadDim = 96; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 128) { \ + constexpr static int kHeadDim = 128; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 160) { \ + constexpr static int kHeadDim = 160; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 192) { \ + constexpr static int kHeadDim = 192; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 224) { \ + constexpr static int kHeadDim = 224; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 256) { \ + constexpr static int kHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/debug_utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/debug_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f1f04665613538d11b33a6ea9ecb351b8fda11da --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/debug_utils.h @@ -0,0 +1,215 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// +// Debugging functions +//////////////////////////////////////////////////////////////////////////////// +// Nans & inf detection +#define NANCHECK(frag) \ + { \ + for (int _i = 0; _i < frag.size(); ++_i) { \ + assert(std::isfinite(float(frag[_i]))); \ + assert(!std::isnan(float(frag[_i]))); \ + } \ + } + +// Print on the first thread of the first block +#if 1 +#define PRINT_WARP_ID 0 +#define PRINT_LANE_ID 0 +#define PRINT_B0_T0(msg, ...) \ + if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && \ + threadIdx.x == PRINT_LANE_ID && threadIdx.y == PRINT_WARP_ID && \ + threadIdx.z == 0) { \ + printf(msg "\n", ##__VA_ARGS__); \ + } +#define PRINT_T0(msg, ...) \ + if (threadIdx.x == PRINT_LANE_ID && threadIdx.y == PRINT_WARP_ID && \ + threadIdx.z == 0) { \ + printf(msg "\n", ##__VA_ARGS__); \ + } +#define PRINT_TX_LX(msg, ...) \ + for (int bx = 0; bx < gridDim.x; ++bx) { \ + for (int by = 0; by < gridDim.y; ++by) { \ + for (int bz = 0; bz < gridDim.z; ++bz) { \ + for (int tx = 0; tx < blockDim.x; ++tx) { \ + for (int ty = 0; ty < blockDim.y; ++ty) { \ + for (int tz = 0; tz < blockDim.z; ++tz) { \ + __syncthreads(); \ + if (blockIdx.x == bx && blockIdx.y == by && blockIdx.z == bz && \ + threadIdx.x == tx && threadIdx.y == ty && \ + threadIdx.z == tz) { \ + printf( \ + "[%d,%d,%d][%d,%d,%d]" msg "\n", \ + bx, \ + by, \ + bz, \ + tx, \ + ty, \ + tz, \ + ##__VA_ARGS__); \ + } \ + } \ + } \ + } \ + } \ + } \ + } +#else +#define PRINT_B0_T0 +#define PRINT_TX_LX +#endif + +struct __string_view { + char const* data; + std::size_t size; +}; +#if __cplusplus >= 201402L +template +constexpr __string_view __get_type_name() { + char const* p = __PRETTY_FUNCTION__; + while (*p++ != '=') + ; + for (; *p == ' '; ++p) + ; + char const* p2 = p; + int count = 1; + for (;; ++p2) { + switch (*p2) { + case '[': + ++count; + break; + case ']': + --count; + if (!count) + return {p, std::size_t(p2 - p)}; + } + } + return {}; +} +#else +template +constexpr __string_view __get_type_name() { + return {"unsupported", 11}; +} +#endif + +// Print a given array +#define PRINT_ACCUM8_T0_L0_START(name, accum, start) \ + PRINT_B0_T0( \ + "%s[%d:%d] - {%f, %f, %f, %f, %f, %f, %f, %f}", \ + name, \ + int(start), \ + int(start + 8), \ + float(accum[start + 0]), \ + float(accum[start + 1]), \ + float(accum[start + 2]), \ + float(accum[start + 3]), \ + float(accum[start + 4]), \ + float(accum[start + 5]), \ + float(accum[start + 6]), \ + float(accum[start + 7])); +#define PRINT_ACCUM8_T0_L0(name, accum) PRINT_ACCUM8_T0_L0_START(name, accum, 0) +#define PRINT_FRAG_T0_L0(name, frag) \ + { \ + auto typeStr = __get_type_name(); \ + PRINT_B0_T0("printing %s (%s)", name, typeStr.data); \ + for (int _start = 0; _start < frag.size(); _start += 8) { \ + PRINT_ACCUM8_T0_L0_START(" ", frag, _start); \ + } \ + /*__syncthreads(); \ + NANCHECK(frag); */ \ + } +#define PRINT_ARRAY_T0_L0_INCR(name, array, length, incr) \ + { \ + PRINT_B0_T0("printing %s (len=%d)", name, int(length)); \ + for (int _start = 0; _start < length; _start += incr) { \ + PRINT_ACCUM8_T0_L0_START(" ", array, _start); \ + } \ + } +#define PRINT_ARRAY_T0_L0(name, array, length) \ + PRINT_ARRAY_T0_L0_INCR(name, array, length, 8) + +// Print a 4x4 matrix +#define PRINT_TENSOR4x4_T0_L0_START(name, ref, start_x, start_y) \ + PRINT_B0_T0( \ + "%s[%d:%d, %d:%d]:\n %f, %f, %f, %f\n %f, %f, %f, %f\n %f, %f, %f, %f\n %f, %f, %f, %f", \ + name, \ + int(start_x), \ + int(start_x + 4), \ + int(start_y), \ + int(start_y + 4), \ + float(ref.at({start_x + 0, start_y + 0})), \ + float(ref.at({start_x + 0, start_y + 1})), \ + float(ref.at({start_x + 0, start_y + 2})), \ + float(ref.at({start_x + 0, start_y + 3})), \ + float(ref.at({start_x + 1, start_y + 0})), \ + float(ref.at({start_x + 1, start_y + 1})), \ + float(ref.at({start_x + 1, start_y + 2})), \ + float(ref.at({start_x + 1, start_y + 3})), \ + float(ref.at({start_x + 2, start_y + 0})), \ + float(ref.at({start_x + 2, start_y + 1})), \ + float(ref.at({start_x + 2, start_y + 2})), \ + float(ref.at({start_x + 2, start_y + 3})), \ + float(ref.at({start_x + 3, start_y + 0})), \ + float(ref.at({start_x + 3, start_y + 1})), \ + float(ref.at({start_x + 3, start_y + 2})), \ + float(ref.at({start_x + 3, start_y + 3}))); +#define PRINT_TENSOR4x4_T0_L0(name, ref) \ + PRINT_TENSOR4x4_T0_L0_START(name, ref, 0, 0) + +#define PRINT_PROBLEM_SIZE(name, ps) \ + PRINT_B0_T0( \ + "%s.problem_size: {.m=%d, .n=%d, .k=%d}", \ + name, \ + int(ps.m()), \ + int(ps.n()), \ + int(ps.k())) + +template +CUTLASS_DEVICE void print_warp_accum( + AccumT accum, + LaneOffsetT lane_offset, + int32_t num_rows, + int32_t num_cols) { + bool is_main = blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && + threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0; + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + if (col % 32 == 0) { + if (is_main) { + printf("\nmat[%3d, %3d:%3d]", row, col, col + 32); + } + __syncthreads(); + } + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (row == accum_m && col == accum_n && + (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0)) { + printf(" %6.1f", float(accum[idx])); + } + }, + [&](int accum_m) {}); + __syncthreads(); + } + if (is_main) { + printf("\n"); + } + } +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma.h new file mode 100644 index 0000000000000000000000000000000000000000..3da0bfc9d2e5634dbf97c85f91d13423e5962036 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma.h @@ -0,0 +1,105 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include + +#include +#include +template +struct MakeCustomMma; + +template < + typename Shape, + typename IteratorA, + typename SmemIteratorA, + cutlass::arch::CacheOperation::Kind CacheOpA, + typename IteratorB, + typename SmemIteratorB, + cutlass::arch::CacheOperation::Kind CacheOpB, + typename ElementC, + typename LayoutC, + typename Policy, + int Stages, + cutlass::gemm::SharedMemoryClearOption SharedMemoryClear, + int kMaxK> +struct MakeCustomMma< + cutlass::gemm::threadblock::MmaMultistage< + Shape, + IteratorA, + SmemIteratorA, + CacheOpA, + IteratorB, + SmemIteratorB, + CacheOpB, + ElementC, + LayoutC, + Policy, + Stages, + SharedMemoryClear>, + kMaxK> { + // Reduce the number of stages if we don't need that many + static int constexpr kStages = + kMaxK == cutlass::platform::numeric_limits::max() + ? Stages + : cutlass::const_min( + Stages, + (kMaxK + int(Shape::kK) - 1) / int(Shape::kK)); + using Mma = cutlass::gemm::threadblock::CustomMmaMultistage< + Shape, + IteratorA, + SmemIteratorA, + CacheOpA, + IteratorB, + SmemIteratorB, + CacheOpB, + ElementC, + LayoutC, + Policy, + kStages, + SharedMemoryClear, + kMaxK>; +}; + +template < + typename Shape, + typename IteratorA, + typename SmemIteratorA, + typename IteratorB, + typename SmemIteratorB, + typename ElementC, + typename LayoutC, + typename Policy, + int kMaxK> +struct MakeCustomMma< + cutlass::gemm::threadblock::MmaPipelined< + Shape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + Policy>, + kMaxK> { + using Mma = cutlass::gemm::threadblock::CustomMmaPipelined< + Shape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + Policy>; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_base.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_base.h new file mode 100644 index 0000000000000000000000000000000000000000..d0ef20a972efe604f0ff2c8bc47c810b2d0ced34 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_base.h @@ -0,0 +1,188 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class CustomMmaBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. + using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape< + Shape::kM / WarpGemm::kM, + Shape::kN / WarpGemm::kN, + Shape::kK / WarpGemm::kK>; + + /// Number of warp-level GEMM operations + static int const kWarpGemmIterations = + (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + template + struct OperandSharedStorage { + AlignedBuffer buffer; + using TensorRef = TensorRef; + + CUTLASS_DEVICE + static OperandLayout Layout() { + return OperandLayout::packed({OperandShape::kRow, OperandShape::kColumn}); + } + + /// Returns a TensorRef to the operand + CUTLASS_HOST_DEVICE + TensorRef ref() { + return TensorRef{buffer.data(), Layout()}; + } + }; + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape< + Shape::kM + Policy::SmemPaddingA::kRow, + Shape::kK * kStages + Policy::SmemPaddingA::kColumn>; + + /// Shape of the B matrix operand in shared memory + using ShapeB = MatrixShape< + Shape::kK * kStages + Policy::SmemPaddingB::kRow, + Shape::kN + Policy::SmemPaddingB::kColumn>; + + using SharedStorageA = OperandSharedStorage< + typename Operator::ElementA, + ShapeA, + typename Operator::LayoutA>; + using SharedStorageB = OperandSharedStorage< + typename Operator::ElementB, + ShapeB, + typename Operator::LayoutB>; + using TensorRefA = typename SharedStorageA::TensorRef; + using TensorRefB = typename SharedStorageB::TensorRef; + + struct SharedStorage { + /// Buffer for A operand + SharedStorageA operand_A; + + /// Buffer for B operand + SharedStorageB operand_B; + }; + + protected: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorageA& shared_storageA, + SharedStorageB& shared_storageB, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_A_(shared_storageA.ref(), lane_idx), + warp_tile_iterator_B_(shared_storageB.ref(), lane_idx) {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_multistage.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_multistage.h new file mode 100644 index 0000000000000000000000000000000000000000..137624bdfb00d9320ec9787400051f34c1fe5180 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_multistage.h @@ -0,0 +1,773 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, + /// Upper boundon the K dimension + int kMaxK = cutlass::platform::numeric_limits::max(), + /// Used for partial specialization + typename Enable = bool> +class CustomMmaMultistage : public CustomMmaBase { + public: + ///< Base class + using Base = CustomMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + /// Internal structure exposed for introspection. + struct Detail { + static_assert( + Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / + Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / + Base::kWarpGemmIterations; + }; + + static bool const kSmemContainsEntireMat = kMaxK <= Shape::kK * Stages; + static constexpr int kNumStagesConcurrentLoad = + kSmemContainsEntireMat ? Stages : Stages - 1; + + private: + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + bool prologue_done_; + + // Set to `True` to ensure the accumulator will be zero outside the GEMM + // footprint + bool zero_outside_bounds_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storageA.ref(), thread_idx), + smem_iterator_B_(shared_storageB.ref(), thread_idx), + prologue_done_(false), + zero_outside_bounds_(false) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + CUTLASS_DEVICE + CustomMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& st, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : CustomMmaMultistage( + st.operand_A, + st.operand_B, + thread_idx, + warp_idx, + lane_idx) {} + + CUTLASS_DEVICE + void set_prologue_done(bool value) { + prologue_done_ = value; + } + + CUTLASS_DEVICE + void set_zero_outside_bounds(bool value) { + zero_outside_bounds_ = value; + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorage& shared_storage, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + prologue( + shared_storage.operand_A, + shared_storage.operand_B, + iterator_A, + iterator_B, + thread_idx, + problem_size_k); + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + SmemIteratorA smem_iterator_A(shared_storageA.ref(), thread_idx); + SmemIteratorB smem_iterator_B(shared_storageB.ref(), thread_idx); + int32_t iter = (problem_size_k + Base::Shape::kK - 1) / Base::Shape::kK; + _prologue( + iterator_A, iterator_B, iter, smem_iterator_A, smem_iterator_B); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance( + IteratorA& iterator_A, + IteratorB& iterator_B, + int group_start_A = 0, + int group_start_B = 0) { + iterator_A.set_iteration_index( + group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + if (zero_outside_bounds_ || + SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index( + group_start_B * IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B.get(); + + if (zero_outside_bounds_ || + SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + } + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } + } + + template + CUTLASS_DEVICE static void _prologue( + IteratorA& iterator_A, + IteratorB& iterator_B, + int32_t& gemm_k_iterations, + SmemIteratorA& smem_iterator_A_, + SmemIteratorB& smem_iterator_B_) { + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; + ++stage, --gemm_k_iterations) { + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + iterator_A.set_iteration_index(0); + smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast( + smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); + + if (kLoadA) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + } + + ++iterator_A; + } + + ++smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast( + smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + if (kLoadB) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + } + + ++iterator_B; + } + + ++smem_iterator_B_; + } + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + smem_iterator_A_.add_tile_offset({0, 1}); + smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC& accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< initial value of accumulator + FragmentC const& src_accum) { + // + // Prologue + // + + if (!prologue_done_) { + _prologue( + iterator_A, + iterator_B, + gemm_k_iterations, + smem_iterator_A_, + smem_iterator_B_); + } else if (!kSmemContainsEntireMat) { + _prologue( + iterator_A, + iterator_B, + gemm_k_iterations, + smem_iterator_A_, + smem_iterator_B_); + } else { + gemm_k_iterations -= kNumStagesConcurrentLoad; + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // + // Clear the remaining tiles of SMEM. This is a functional requirement for + // some kernels so that all accumulator elements outside the GEMM footprint + // are zero. + // + + if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) { + /// Iterator to write threadblock-scoped tile of A operand to shared + /// memory + SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_); + + typename IteratorA::AccessType zero_A; + zero_A.clear(); + + last_smem_iterator_A.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast( + last_smem_iterator_A.get()); + + *dst_ptr = zero_A; + + ++last_smem_iterator_A; + } + + /// Iterator to write threadblock-scoped tile of B operand to shared + /// memory + SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_); + typename IteratorB::AccessType zero_B; + + zero_B.clear(); + last_smem_iterator_B.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast( + last_smem_iterator_B.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B; + } + } + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform( + warp_transformed_frag_A[0], + warp_transformed_frag_B[0], + warp_loaded_frag_A[0], + warp_loaded_frag_B[0]); + + // tf32x3 kernels use staging accumulation. warp_mma uses a temporary + // accumulator and this temporary accumulator is added to the final + // accumulator once in every mainloop iteration. + plus plus_accum; + + FragmentC tmp_accum; + + if (platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + tmp_accum.clear(); + } + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-kNumStagesConcurrentLoad);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + this->warp_tile_iterator_A_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + // In case of a non-circular buffer ("kSmemContainsEntireMat") + // make sure we don't load out of bounds data. + if (!kSmemContainsEntireMat || + gemm_k_iterations > (-kNumStagesConcurrentLoad) || + warp_mma_k < Base::kWarpGemmIterations - 1) { + this->warp_tile_iterator_A_.load( + warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load( + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + } + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma.transform( + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + if (platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + warp_mma( + tmp_accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + tmp_accum); + + if (warp_mma_k == 0) { + accum = plus_accum(accum, tmp_accum); + tmp_accum.clear(); + } + } else { + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum); + } + + // Issue global->shared copies for the this stage + if (!kSmemContainsEntireMat && + warp_mma_k < Base::kWarpGemmIterations - 1) { + int group_start_iteration_A, group_start_iteration_B; + + group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA; + group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance( + iterator_A, + iterator_B, + group_start_iteration_A, + group_start_iteration_B); + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + if (!kSmemContainsEntireMat) { + int group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance( + iterator_A, + iterator_B, + group_start_iteration_A, + group_start_iteration_B); + } + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (!kSmemContainsEntireMat && + smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, + -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations) + warp_mma.transform( + warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + } + } + + if (platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + accum = plus_accum(accum, tmp_accum); + } + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) { + // commit and drain all pending and predicated cp.async pnz from the GEMM + // mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_pipelined.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_pipelined.h new file mode 100644 index 0000000000000000000000000000000000000000..a36d7cdb58242e9e43b8050cf9068ee11024aed8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_pipelined.h @@ -0,0 +1,407 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_ = NumericArrayConverter< + typename SmemIteratorA_::Element, + typename IteratorA_::Element, + IteratorA_::Fragment::kElements>, + /// + /// Transformation applied to B operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool> +class CustomMmaPipelined : public CustomMmaBase { + public: + ///< Base class + using Base = CustomMmaBase; + + using Shape = + Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = + IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = + IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // statically assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert( + (Base::kStages == 2), + "MmaPipelined requires kStages set to value 2"); + + static bool const kSmemContainsEntireMat = false; + + private: + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + + protected: + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaPipelined( + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ) + : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storageA.ref(), thread_idx), + smem_iterator_B_(shared_storageB.ref(), thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + CUTLASS_DEVICE + CustomMmaPipelined( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& st, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : CustomMmaPipelined( + st.operand_A, + st.operand_B, + thread_idx, + warp_idx, + lane_idx) {} + + CUTLASS_DEVICE + void set_prologue_done(bool value) { + // NOT IMPLEMENTED FOR PIPELINED + } + + CUTLASS_DEVICE + void set_zero_outside_bounds(bool value) { + // NOT NEEDED FOR PIPELINED + // shared memory will always be zero-filled + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorage& shared_storage, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + prologue( + shared_storage.operand_A, + shared_storage.operand_B, + iterator_A, + iterator_B, + thread_idx, + problem_size_k); + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + // NOT IMPLEMENTED FOR PIPELINED + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC& accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const& src_accum, ///< source accumulator tile + TransformA transform_A = + TransformA(), ///< transformation applied to A fragment + TransformB transform_B = + TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + + tb_frag_A.clear(); + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + iterator_A.clear_mask(gemm_k_iterations <= 1); + iterator_B.clear_mask(gemm_k_iterations <= 1); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* + // issuing shared memory loads (which have the tighest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } else { + this->warp_tile_iterator_A_.add_tile_offset( + {0, + -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + // Avoid reading out of bounds if this was the last loop iteration + iterator_A.clear_mask(gemm_k_iterations <= 2); + iterator_B.clear_mask(gemm_k_iterations <= 2); + } + + warp_mma( + accum, + warp_frag_A[warp_mma_k % 2], + warp_frag_B[warp_mma_k % 2], + accum); + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/find_default_mma.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/find_default_mma.h new file mode 100644 index 0000000000000000000000000000000000000000..e7408deec401b691837bf4f063ae71da79093f11 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/find_default_mma.h @@ -0,0 +1,172 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +/*! \file + \brief Cutlass provides helper template functions to figure out the right + data structures to instantiate to run a GEMM with various parameters (see + `cutlass/gemm/threadblock/default_mma.h`). However, due to template + instantiation priority rules, it will only create an MmaMultiStage with + kStages=3 (otherwise creates an MmePipelined - which is not compatible with + FastF32). kStages=3 uses too much shared memory and we want to use kStages=2, + so we just copy-pasted some code from `default_mma.h` and + `default_mma_core.h` files and wrapped this template to allow our use case. + + This is really only for the FastF32 case - aka using TensorCores with fp32. +*/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace cutlass { +namespace gemm { +namespace threadblock { + +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Layout type for C and D matrix operand + typename LayoutC, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + typename Enable_ = void> +struct FindDefaultMma { + static constexpr bool AccumulatorsInRowMajor = false; + static constexpr SharedMemoryClearOption SharedMemoryClear = + SharedMemoryClearOption::kNone; + using DefaultMma = cutlass::gemm::threadblock::DefaultMma< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementAccumulator, + LayoutC, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + Stages, + Operator, + AccumulatorsInRowMajor, + SharedMemoryClear>; +}; + +/// Specialization for sm80 / FastF32 / multistage with kStages=2 +template < + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + int kStages, + typename Operator> +struct FindDefaultMma< + ElementA_, + LayoutA_, + kAlignmentA, + ElementB_, + LayoutB_, + kAlignmentB, + ElementAccumulator, + layout::RowMajor, + arch::OpClassTensorOp, + arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + kStages, + Operator, + typename cutlass::platform::enable_if<(kAlignmentA > 1)>::type> { + using LayoutC = layout::RowMajor; + using OperatorClass = arch::OpClassTensorOp; + using ArchTag = arch::Sm80; + + using DefaultMma_ = cutlass::gemm::threadblock::DefaultMma< + ElementA_, + LayoutA_, + kAlignmentA, + ElementB_, + LayoutB_, + kAlignmentB, + ElementAccumulator, + LayoutC, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + 3, + Operator>; + struct DefaultMma : DefaultMma_ { + using MmaCore_ = typename DefaultMma_::MmaCore; + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore_::Shape, + typename DefaultMma_::IteratorA, + typename MmaCore_::SmemIteratorA, + MmaCore_::kCacheOpA, + typename DefaultMma_::IteratorB, + typename MmaCore_::SmemIteratorB, + MmaCore_::kCacheOpB, + ElementAccumulator, + LayoutC, + typename MmaCore_::MmaPolicy, + kStages>; + }; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_accum_lambda_iterator.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_accum_lambda_iterator.h new file mode 100644 index 0000000000000000000000000000000000000000..67502a7dd12dfbd8074aff0482d462f8e2272b76 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_accum_lambda_iterator.h @@ -0,0 +1,359 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include +#include +#include +#include + +/* +TensorCores have different accumulator layouts. +This file provides a class to easily map the accumulator +i-th element with the corresponding matrix row/col. +*/ + +template +struct AccumLambdaIteratorSm80 { + static_assert( + cutlass::platform:: + is_same::value, + "only RowMajor is supported"); + + using Policy = typename T::Policy; + using InstructionShape = typename T::InstructionShape; + using OpDelta = typename T::OpDelta; + using Shape = typename T::Shape; + static int const kElementsPerAccess = InstructionShape::kN / 4; + static int const kRowsPerTile = 8; + static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile; + + static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset( + int8_t lane_id, + int8_t warp_id, + typename T::TensorCoord const& tile_offset) { + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + return cutlass::MatrixCoord( + quad + tile_offset.row() * Shape::kRow, + lane_in_quad * kElementsPerAccess + + tile_offset.column() * Shape::kColumn); + } + + template + CUTLASS_DEVICE static void iterateRows( + cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) { + // See cutlass/gemm/warp/mma_tensor_op_tile_iterator.h + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < kAccumulatorRows; ++row) { + int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow + + row * kRowsPerTile + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + int mma_accum_start = kAccumulatorRows * kElementsPerAccess * + (mma_n * Policy::MmaIterations::kRow + mma_m); + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < kElementsPerAccess; ++col) { + int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + + col + lane_offset.column(); + int idx = mma_accum_start + row * kElementsPerAccess + col; + op(accum_m, accum_n, idx); + } + } + + endRow(accum_m); + } + } + } + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) { + // In each warp, 4 threads will work on the same row + // - the ones with the same `quad` + auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1); + myValue = fn(myValue, otherV); + otherV = __shfl_xor_sync(0xffffffff, myValue, 2); + myValue = fn(myValue, otherV); + int lane_in_quad = (lane_id & 3); + return lane_in_quad == 0; + } +}; + +template +struct AccumLambdaIteratorSm70 { + static_assert( + cutlass::platform:: + is_same::value, + "only RowMajor is supported"); + + using Policy = typename T::Policy; + using InstructionShape = typename T::InstructionShape; + using OpDelta = typename T::OpDelta; + using Shape = typename T::Shape; + using Element = accum_t; + + static int const kElementsPerPartial = 4; + using EleShapePerPatial = typename cutlass::platform::conditional< + cutlass::platform::is_same::value, + cutlass::MatrixShape<2, 2>, + cutlass::MatrixShape<1, 4>>::type; + static int const kElementsPerMma = 8; + static int const kAccumulatorPatials = 2; + using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>; + + static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset( + int8_t lane_id, + int8_t warp_id, + typename T::TensorCoord const& tile_offset) { + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + int accum_m, accum_n; + + if (cutlass::platform::is_same::value) { + // (quad[2],quad[0])+lane_in_quad[0] + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1); + // (quad[1])+lane_in_quad[1] + accum_n = + ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials + + (lane_in_quad & 2); + } else { + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + + lane_in_quad; // (quad[2],quad[0]) + accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials; + } + return cutlass::MatrixCoord( + accum_m + tile_offset.row() * Shape::kRow, + accum_n + tile_offset.column() * Shape::kColumn); + } + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) { + static_assert( + cutlass::platform::is_same::value, + "update to support non-float accum"); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-884-f16 + // T0 & T2 share same line within a quad + auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 1); + myValue = fn(myValue, otherV); + // quad 0 and quad 2 are on the same lines + otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 3); + myValue = fn(myValue, otherV); + return (lane_id & ((1 << 1) | (1 << 3))) == 0; + } + + template + CUTLASS_DEVICE static void iterateRows( + cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) { + CUTLASS_PRAGMA_UNROLL + for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < EleShapePerPatial::kRow; ++m) { + int accum_m = tile_m * Policy::InterleavedTile::kRow + + mma_m * QuadShapePerPatialMma::kRow + m * 2 + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; + ++tile_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; + ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < kAccumulatorPatials; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < EleShapePerPatial::kColumn; ++n) { + int mma_accum_start = + (((tile_n * Policy::TileIterations::kRow + tile_m) * + Policy::MmaIterations::kColumn + + mma_n) * + Policy::MmaIterations::kRow + + mma_m) * + kElementsPerMma; + int accum_n = tile_n * Policy::InterleavedTile::kColumn + + mma_n * QuadShapePerPatialMma::kColumn + + p * Policy::InterleavedTile::kColumn / 2 + n + + lane_offset.column(); + int idx = mma_accum_start + p * kElementsPerPartial + + m * EleShapePerPatial::kColumn + n; + op(accum_m, accum_n, idx); + } + } + } + } + endRow(accum_m); + } + } + } + } +}; + +template +struct AccumLambdaIteratorSimt { + using Policy = typename T::Policy; + using Iterations = typename T::Iterations; + using Element = typename T::Element; + using Delta = typename T::Delta; + using Shape = typename T::Shape; + static_assert( + cutlass::platform:: + is_same::value, + "only RowMajor is supported"); + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) { + CUTLASS_PRAGMA_UNROLL + for (int bit = 1; bit < Policy::WarpShape::kColumn; bit *= 2) { + auto otherV = __shfl_xor_sync(0xffffffff, myValue, bit); + myValue = fn(myValue, otherV); + } + return (lane_id & (Policy::WarpShape::kColumn - 1)) == 0; + } + + template + CUTLASS_DEVICE static void iterateRows( + cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) { + int accum_m = mma_m * Delta::kRow + m + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) { + int accum_n = + mma_n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN + + lane_offset.column(); + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) { + int idx = n + + Policy::LaneMmaShape::kN * + (mma_n + + Iterations::kColumn * + (m + mma_m * Policy::LaneMmaShape::kM)); + op(accum_m, accum_n + n, idx); + } + } + endRow(accum_m); + } + } + } + + static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset( + int8_t lane_id, + int8_t warp_id, + typename T::TensorCoord const& tile_offset) { + static_assert( + cutlass::platform::is_same< + typename Policy::LaneLayout, + cutlass::layout::RowMajorInterleaved<1>>::value, + ""); + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + cutlass::MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + cutlass::MatrixCoord(Policy::LaneMmaShape::kM, + Policy::LaneMmaShape::kN); + return lane_offset + + tile_offset * cutlass::MatrixCoord(Shape::kRow, Shape::kColumn); + } +}; + +template +struct DefaultMmaAccumLambdaIterator; + +// Simt +template +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaSimtTileIterator< + S, + cutlass::gemm::Operand::kC, + accum_t, + cutlass::layout::RowMajor, + P, + 1, + 1>, + accum_t, + kWarpSize> { + using WarpIterator = typename cutlass::gemm::warp::MmaSimtTileIterator< + S, + cutlass::gemm::Operand::kC, + accum_t, + cutlass::layout::RowMajor, + P, + 1, + 1>; + using Iterator = AccumLambdaIteratorSimt; +}; + +// TensorOp - Volta +template +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + cutlass::MatrixShape<1, 1>>, + accum_t, + kWarpSize> { + using WarpIterator = + typename cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + cutlass::MatrixShape<1, 1>>; + using Iterator = AccumLambdaIteratorSm70; +}; + +// TensorOp - Sm75+ +template < + typename S1, + typename S2, + typename S3, + typename accum_t, + int kWarpSize> +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + S3>, + accum_t, + kWarpSize> { + using WarpIterator = + typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + S3>; + using Iterator = AccumLambdaIteratorSm80; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_from_smem.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_from_smem.h new file mode 100644 index 0000000000000000000000000000000000000000..294d37d510f4339fb4bb794a203cc148a6402aa6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_from_smem.h @@ -0,0 +1,1953 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +/// Shared storage object needed by accumulator +/// From 13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h +template < + typename Shape_, + typename Element_, + typename Layout_, + typename Padding_> +class AccumulatorSharedStorage { + public: + // + // Type definitions + // + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using Padding = Padding_; + + /// Tensor reference to the accumulator + using TensorRefAccum = cutlass::TensorRef; + + /// Shape of the accumulator matrix in shared memory + using ShapeAccum = cutlass:: + MatrixShape; + + public: + // + // Data members + // + + /// Buffer for accumulator + cutlass::AlignedBuffer accum; + + public: + // + // Methods + // + + /// Returns a layout object for the Accum matrix + CUTLASS_DEVICE + static Layout LayoutAccum() { + return Layout::packed({ShapeAccum::kRow, ShapeAccum::kColumn}); + } + + /// Returns a TensorRef to the Accumulator + CUTLASS_HOST_DEVICE + TensorRefAccum accum_ref() { + return TensorRefAccum{accum.data(), LayoutAccum()}; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + // Maximum K dimension - also the dimension of the shared-memory + // holding `OperandA` + int kMaxK_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Layout in shared-memory of operand A + typename SmemLayoutA, + /// Used for partial specialization + typename Enable = bool> +class MmaBaseFromSharedMemory { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + static constexpr int kMaxK = kMaxK_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. + using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape< + Shape::kM / WarpGemm::kM, + Shape::kN / WarpGemm::kN, + Shape::kK / WarpGemm::kK>; + using WarpCount1 = WarpCount; + + /// Number of warp-level GEMM operations + static int const kWarpGemmIterations = + (WarpGemm::kK / Operator::Policy::MmaShape::kK); + static int const kWarpGemmIterations1 = kWarpGemmIterations; + + /// Number of stages + static int const kStages = Stages; + + /// If this is true, we fill the entire shmem buffer at start + /// and don't need to iterate through it in a circular fashion + static bool const kSmemContainsEntireB = kMaxK <= Shape::kK * kStages; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the B operand + using TensorRefB = + TensorRef; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the B matrix operand in shared memory + using ShapeB = MatrixShape< + Shape::kK * kStages + Policy::SmemPaddingB::kRow, + Shape::kN + Policy::SmemPaddingB::kColumn>; + + public: + // + // Data members + // + + /// Buffer for B operand + AlignedBuffer operand_B; + + public: + // + // Methods + // + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + /// Returns a TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { + return TensorRefB{operand_B.data(), LayoutB()}; + } + }; + + protected: + // + // Data members + // + + // /// Iterator to load a warp-scoped tile of A operand from shared memory + // typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + MmaBaseFromSharedMemory( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + TensorRefB& b_tile, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_B_(b_tile, lane_idx) {} +}; + +namespace { + +// has necessary trait compliance with WarpIteratorFromSmem but doesn't do +// anything, can be default initialized, and uses fragment that takes up +// (almost) no space. this warp iterator is selected at compile time when +// elementwise on-the-fly scaling for operand A is disabled, in which case +// operations related to loading scale factors for operand A get wiped out by +// the compiler. +template +class NoOpWarpIteratorScale { + public: + // in pipelined+multistage MMA implementations we keep an array of fragments. + // if we aren't using scaling we don't want to waste registers on fragments + // of scale elements, so ideally this would be sized 0. + // Since arrays of zero-sized objects are not allowed, using size as 1. + // The compiler will most likely wipe it out anyways. + using Fragment = cutlass::Array; + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale() {} + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale(TensorRef const&, int) {} + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale& add_tile_offset( + typename TensorRef::TensorCoord const&) { + return *this; + } + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale& operator++() { + return *this; + } + + CUTLASS_DEVICE + void load(Fragment&) const {} +}; + +// if scaling is enabled, performs fragment elementwise multiplication between +// fragment and its scaling factor. +template +class FragmentElementwiseScaler; + +// specialization for scaling being enabled. +template +class FragmentElementwiseScaler { + public: + // cast scale_frag to correct type then apply elementwise to fragment + CUTLASS_DEVICE + static Fragment apply(Fragment frag, FragmentScale const& scale_frag) { + Fragment converted_scale_frag = cutlass::NumericArrayConverter< + typename Fragment::Element, + typename FragmentScale::Element, + FragmentScale::kElements>()(scale_frag); + return cutlass::multiplies()(frag, converted_scale_frag); + } +}; + +// specialization for scaling being disabled. doesn't do anything and should +// just get wiped out by the compiler. +template +class FragmentElementwiseScaler { + public: + CUTLASS_DEVICE + static Fragment apply(Fragment frag, FragmentScale const&) { + return frag; + } +}; +} // namespace + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + // BEGIN smem + /// Iterates over the intermediate accumulator tile in shared memory + typename WarpIteratorA_, + /// whether or not to perform elementwise multiplication of A + // by another matrix (A_scale) that is also kept in shared memory prior + // to matmul A @ B + bool ScaleOperandA_, + /// Max GEMM problem size in K dimension + int MaxK, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to B operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool> +class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory< + Shape_, + MaxK, + Policy_, + 2, + typename WarpIteratorA_::Layout> { + public: + ///< Base class + using Base = MmaBaseFromSharedMemory< + Shape_, + MaxK, + Policy_, + 2, + typename WarpIteratorA_::Layout>; + + using Shape = + Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + static constexpr bool ScaleOperandA = ScaleOperandA_; + + using WarpIteratorA = WarpIteratorA_; + ///< loads fragments of A_scale from shared memory if operand A scaling is + ///< enabled. otherwise no-op. + using WarpIteratorAScale = typename cutlass::platform::conditional< + ScaleOperandA, + WarpIteratorA, + NoOpWarpIteratorScale>::type; + + using IteratorB = + IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorB = SmemIteratorB_; + + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // statically assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert( + (Base::kStages == 2), + "MmaPipelined requires kStages set to value 2"); + + private: + using WarpFragmentA = typename Operator::FragmentA; + + /// fragment type of OperandA elementwise scaling matrix. (almost) empty + /// if operand A scaling is disabled. + using WarpFragmentAScale = typename WarpIteratorAScale::Fragment; + + using WarpFragmentB = typename Operator::FragmentB; + + /// applies scaling factor to operand A fragment if operand A scaling is + /// enabled. otherwise no-op. + using FragmentAScaler = FragmentElementwiseScaler< + WarpFragmentA, + WarpFragmentAScale, + ScaleOperandA>; + + protected: + // /// Iterator to write threadblock-scoped tile of A operand to shared memory + // SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to load a warp-scoped tile of A operand from intermediate + /// accumulator tile + WarpIteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of A_scale from intermediate + /// accumulator tile (only used if ScaleOperandA_ is true) + WarpIteratorAScale warp_tile_iterator_A_scale_; + + public: + /// constructor for MMA with operand A scaling enabled. + CUTLASS_DEVICE + MmaPipelinedFromSharedMemory( + typename Base::TensorRefA a, // Operand A in shared memory + typename Base::TensorRefA a_scale, // Operand A_scale in shared memory + typename Base::TensorRefB + b_staging, // staging memory for loading tiles of B + int thread_idx, + int warp_idx, + int lane_idx) + : Base(b_staging, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A_(a, lane_idx), + warp_tile_iterator_A_scale_(a_scale, lane_idx), + smem_iterator_B_(b_staging, thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_A_scale_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Construct from tensor references + CUTLASS_DEVICE + MmaPipelinedFromSharedMemory( + typename Base::TensorRefA a, ///< Operand A in shared memory + typename Base::TensorRefB b_staging, ///< staging memory for loading B + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx) ///< ID of each thread within a warp + : Base(b_staging, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A_(a, lane_idx), + smem_iterator_B_(b_staging, thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + // For API compatibility with MmaMultistageFromSharedMemory + // but not supported as it worsens perf: older gpus < sm80 don't + // support async transfers and have to waste registers + CUTLASS_DEVICE + void set_prologue_done(bool value) {} + CUTLASS_DEVICE + static void prologue( + typename Base::SharedStorage& shared_storage, + IteratorB iterator_B1, + int thread_idx, + int problem_size_0_n) {} + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC& accum, ///< destination accumulator tile + // IteratorA iterator_A, ///< iterator over A + // operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const& src_accum, ///< source accumulator tile + // TransformA transform_A = TransformA(), ///< transformation + // applied to A fragment + TransformB transform_B = + TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentB tb_frag_B; + + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_B.set_residual_tile(gemm_k_iterations == 1); + iterator_B.load(tb_frag_B); + + ++iterator_B; + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_B_; + + __syncthreads(); + + // remember that WarpFragmentAScale and WarpIteratorAScale are empty/no-op + // if scaling is disabled. + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentAScale warp_frag_A_scale[2]; + WarpFragmentB warp_frag_B[2]; + warp_frag_A[0].clear(); + warp_frag_A_scale[0].clear(); + warp_frag_B[0].clear(); + + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_A_scale_.load(warp_frag_A_scale[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_A_scale_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + iterator_B.set_residual_tile(gemm_k_iterations == 2); + iterator_B.clear_mask(gemm_k_iterations <= 1); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* + // issuing shared memory loads (which have the tightest latency + // requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + bool hasNext = true; + + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + if (gemm_k_iterations > 1) { + // Write fragments to shared memory + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + } + + __syncthreads(); + + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory SMEM: Don't reset iterator A, as + // we are continuing our iteration at this point + if (smem_write_stage_idx == 1) { + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } else { + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + hasNext = gemm_k_iterations > 1; + } + + // Only read the next if we need to + if (hasNext) { + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_A_scale_.load( + warp_frag_A_scale[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_A_scale_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + iterator_B.load(tb_frag_B); + + ++iterator_B; + + // Avoid reading out of bounds if this was the last loop iteration + iterator_B.set_residual_tile(gemm_k_iterations == 3); + iterator_B.clear_mask(gemm_k_iterations <= 2); + } + } + + warp_mma( + accum, + FragmentAScaler::apply( + warp_frag_A[warp_mma_k % 2], warp_frag_A_scale[warp_mma_k % 2]), + warp_frag_B[warp_mma_k % 2], + accum); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Iterates over the intermediate accumulator tile in shared memory + typename WarpIteratorA1_, + /// whether or not to perform elementwise multiplication of A + // by another matrix (A_scale) that is also kept in shared memory prior + // to matmul A @ B + bool ScaleOperandA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB1, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy1_, + /// Number of stages, + int Stages_, + int kMaxK_, + /// Used for partial specialization + typename Enable = bool> +class MmaMultistageFromSharedMemory : public MmaBaseFromSharedMemory< + Shape1_, + kMaxK_, + Policy1_, + Stages_, + typename WarpIteratorA1_::Layout> { + public: + ///< Base class + using Base = MmaBaseFromSharedMemory< + Shape1_, + kMaxK_, + Policy1_, + Stages_, + typename WarpIteratorA1_::Layout>; + + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape1 = Shape1_; + ///< Iterates over tiles of B operand in global memory + using IteratorB1 = IteratorB1_; + using IteratorB = IteratorB1; + ///< Policy describing tuning details + using Policy1 = Policy1_; + + using SmemIteratorB1 = SmemIteratorB1_; + using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate + ///< accumulator tile in shared memory + static constexpr bool ScaleOperandA = ScaleOperandA_; + + ///< warp level iterator over A_scale matrix tile kept in shared memory. + ///< if elementwise A scaling is disabled then everything this does is no-op. + using WarpIteratorAScale = typename cutlass::platform::conditional< + ScaleOperandA, + WarpIteratorA1, + NoOpWarpIteratorScale>::type; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1; + static constexpr bool kSmemContainsEntireB = Base::kSmemContainsEntireB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC1 = typename Policy1::Operator::FragmentC; + using FragmentC = FragmentC1; + + /// Warp-level Mma + using Operator1 = typename Policy1::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on B operand + static ComplexTransform const kTransformB1 = Operator1::kTransformB; + + /// Internal structure exposed for introspection. + struct Detail { + static_assert( + Base::kWarpGemmIterations1 > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand B + static int const TBLoadIterationsB1 = + IteratorB1::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB1 = + (TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) / + Base::kWarpGemmIterations1; + }; + + static constexpr int kNumStagesConcurrentLoad = + kSmemContainsEntireB ? Base::kStages : Base::kStages - 1; + + private: + using WarpLoadedFragmentA1 = typename Operator1::FragmentA; + /// fragment of OperandA scale matrix. if operand A scaling is disabled this + /// is (almost) empty. + using WarpLoadedFragmentA1Scale = typename WarpIteratorAScale::Fragment; + using WarpLoadedFragmentB1 = typename Operator1::FragmentB; + using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA; + using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB; + + /// applies elementwise scaling to fragment of A. if operand A scaling is + /// disabled this is a no-op. + using FragmentAScaler = FragmentElementwiseScaler< + WarpLoadedFragmentA1, + WarpLoadedFragmentA1Scale, + ScaleOperandA>; + + private: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A1 operand from intermediate + /// accumulator tile + WarpIteratorA1 warp_tile_iterator_A1_; + + /// Iterator to load a warp-scoped tile of A1_scale operand from shared memory + /// if operand A scaling is disabled everything this does is a no-op. + WarpIteratorAScale warp_tile_iterator_A1_scale_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB1 smem_iterator_B1_; + + bool prologue_done_; + + public: + /// constructor for MMA with operand A scaling enabled. + CUTLASS_DEVICE + MmaMultistageFromSharedMemory( + typename Base::TensorRefA a, + typename Base::TensorRefA a_scale, + typename Base::TensorRefB b_tile, + int thread_idx, + int warp_idx, + int lane_idx) + : Base(b_tile, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A1_(a, lane_idx), + warp_tile_iterator_A1_scale_(a_scale, lane_idx), + smem_iterator_B1_(b_tile, thread_idx), + prologue_done_(false) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + int warp_idx_mn_1 = + warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM; + int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM; + + // Add per-warp offsets in units of warp-level tiles + warp_tile_iterator_A1_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + warp_tile_iterator_A1_scale_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1}); + } + + /// Construct from tensor references + CUTLASS_DEVICE + MmaMultistageFromSharedMemory( + typename Base::TensorRefA a, + typename Base::TensorRefB b_tile, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(b_tile, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A1_(a, lane_idx), + smem_iterator_B1_(b_tile, thread_idx), + prologue_done_(false) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn_1 = + warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN); + + int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM; + int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM; + + // Add per-warp offsets in units of warp-level tiles + warp_tile_iterator_A1_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1}); + } + + CUTLASS_DEVICE + void set_prologue_done(bool value) { + prologue_done_ = value; + } + + CUTLASS_DEVICE + static void prologue( + typename Base::SharedStorage& shared_storage, + IteratorB iterator_B1, + int thread_idx, + int problem_size_0_n) { + SmemIteratorB1 smem_iterator_B1(shared_storage.operand_B_ref(), thread_idx); + _prologue( + iterator_B1, + (problem_size_0_n + Base::Shape::kK - 1) / Base::Shape::kK, + smem_iterator_B1); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance_1( + IteratorB1& iterator_B1, + int group_start_B1 = 0) { + iterator_B1.set_iteration_index( + group_start_B1 * IteratorB1::kAccessesPerVector); + this->smem_iterator_B1_.set_iteration_index(group_start_B1); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) { + if (group_start_B1 + j < Detail::TBLoadIterationsB1) { + typename IteratorB1::AccessType* dst_ptr = + reinterpret_cast( + this->smem_iterator_B1_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / + IteratorB1::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B1.get(); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_B1.valid()); + + ++iterator_B1; + } + ++this->smem_iterator_B1_; + } + } + } + + CUTLASS_DEVICE + static void _prologue( + IteratorB& iterator_B1, + int32_t gemm_k_iterations_1, + SmemIteratorB1& smem_iterator_B1_) { + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; + ++stage, --gemm_k_iterations_1) { + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1); + iterator_B1.clear_mask(gemm_k_iterations_1 == 0); + + iterator_B1.set_iteration_index(0); + smem_iterator_B1_.set_iteration_index(0); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) { + typename IteratorB1::AccessType* dst_ptr = + reinterpret_cast( + smem_iterator_B1_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / + IteratorB1::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B1.get(), iterator_B1.valid()); + + ++iterator_B1; + } + + ++smem_iterator_B1_; + } + + // Move to the next stage + iterator_B1.add_tile_offset({1, 0}); + + smem_iterator_B1_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. + cutlass::arch::cp_async_fence(); + } + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1); + iterator_B1.clear_mask(gemm_k_iterations_1 == 0); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations_1_, + ///< destination accumulator tile + FragmentC1& accum, + ///< iterator over B1 operand in global memory + IteratorB1 iterator_B1, + ///< initial value of accumulator + FragmentC1 const& src_accum) { + // 2nd Gemm + + // + // Prologue + // + // Perform accumulation in the 'd' output operand + accum = src_accum; + + if (!prologue_done_) { + _prologue(iterator_B1, gemm_k_iterations_1_, smem_iterator_B1_); + } else if (!kSmemContainsEntireB) { + // Restore the iterators increments + + int gemm_k_iterations_1 = gemm_k_iterations_1_; + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; + ++stage, --gemm_k_iterations_1) { + iterator_B1.set_iteration_index(0); + this->smem_iterator_B1_.set_iteration_index(0); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + ++iterator_B1; + } + ++this->smem_iterator_B1_; + } + iterator_B1.add_tile_offset({1, 0}); + this->smem_iterator_B1_.add_tile_offset({1, 0}); + } + iterator_B1.set_residual_tile(gemm_k_iterations_1 <= 1); + iterator_B1.clear_mask(gemm_k_iterations_1 <= 0); + } + + // DEPBAR+SYNC + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // remember that WarpFragmentAScale and WarpIteratorAScale are no-op/empty + // if scaling is disabled. + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA1 warp_loaded_frag_A1[2]; + WarpLoadedFragmentA1Scale warp_loaded_frag_A1_scale[2]; + WarpLoadedFragmentB1 warp_loaded_frag_B1[2]; + WarpTransformedFragmentA1 warp_transformed_frag_A1[2]; + WarpTransformedFragmentB1 warp_transformed_frag_B1[2]; + + Operator1 warp_mma1; + + warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]); + ++warp_tile_iterator_A1_; + + warp_tile_iterator_A1_scale_.load(warp_loaded_frag_A1_scale[0]); + ++warp_tile_iterator_A1_scale_; + + this->warp_tile_iterator_B_.set_kgroup_index(0); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B1[0]); + ++this->warp_tile_iterator_B_; + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma1.transform( + warp_transformed_frag_A1[0], + warp_transformed_frag_B1[0], + FragmentAScaler::apply( + warp_loaded_frag_A1[0], warp_loaded_frag_A1_scale[0]), + warp_loaded_frag_B1[0]); + + // tf32x3 kernels use staging accumulation. warp_mma uses a temporary + // accumulator and this temporary accumulator is added to the final + // accumulator once in every mainloop iteration. + plus plus_accum; + + FragmentC1 tmp_accum; + + if (platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + tmp_accum.clear(); + } + + // + // Mainloop + // + + CUTLASS_PRAGMA_UNROLL + for (int gemm_k_iterations_1 = gemm_k_iterations_1_ - (Base::kStages - 1); + gemm_k_iterations_1 > (-Base::kStages + 1); + gemm_k_iterations_1--) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; + ++warp_mma_k) { + // Load warp-level tile from accumulator fragment (A) + // or shared memory (operand B) + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations1); + // skip warp tile loading for the last kgroup (we are out of the buf) + if (gemm_k_iterations_1 > (-Base::kStages + 2) || + warp_mma_k < Base::kWarpGemmIterations1 - 1) { + warp_tile_iterator_A1_.load( + warp_loaded_frag_A1[(warp_mma_k + 1) % 2]); + warp_tile_iterator_A1_scale_.load( + warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load( + warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + } + ++warp_tile_iterator_A1_; + ++warp_tile_iterator_A1_scale_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma1.transform( + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + FragmentAScaler::apply( + warp_loaded_frag_A1[warp_mma_k % 2], + warp_loaded_frag_A1_scale[warp_mma_k % 2]), + warp_loaded_frag_B1[warp_mma_k % 2]); + + if (platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + warp_mma1( + tmp_accum, + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + tmp_accum); + + if (warp_mma_k == 0) { + accum = plus_accum(accum, tmp_accum); + tmp_accum.clear(); + } + } else { + warp_mma1( + accum, + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + accum); + } + + // Issue global->shared copies for the this stage + if (warp_mma_k < Base::kWarpGemmIterations1 - 1) { + int group_start_iteration_B1; + + group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1; + + if (!kSmemContainsEntireB) { + copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1); + } + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations1) { + int group_start_iteration_B1; + group_start_iteration_B1 = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB1; + + if (!kSmemContainsEntireB) { + copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1); + } + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. + arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_B1.add_tile_offset({1, 0}); + + this->smem_iterator_B1_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (!kSmemContainsEntireB) { + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * + Base::kWarpGemmIterations1, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + } + + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 2); + iterator_B1.clear_mask(gemm_k_iterations_1 == 1); + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations1) + warp_mma1.transform( + warp_transformed_frag_A1[(warp_mma_k + 1) % 2], + warp_transformed_frag_B1[(warp_mma_k + 1) % 2], + FragmentAScaler::apply( + warp_loaded_frag_A1[(warp_mma_k + 1) % 2], + warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]), + warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + } + } + + if (platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + accum = plus_accum(accum, tmp_accum); + } + } +}; + +// Converts a "regular" Mma into their counterpart from shared memory +template < + typename Mma_, + int kMaxK, + typename WarpIteratorA_, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA = false> +struct DefaultMmaFromSharedMemory; + +// Mma pipelined +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + typename WarpIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_, + /// Transformation applied to B operand + typename TransformB_, + // Max MMA problem size K + int kMaxK, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA> +struct DefaultMmaFromSharedMemory< + MmaPipelined< + Shape_, + IteratorA_, + SmemIteratorA_, + IteratorB_, + SmemIteratorB_, + ElementC_, + LayoutC_, + Policy_, + TransformA_, + TransformB_>, + kMaxK, + WarpIteratorA_, + kScaleOperandA, + kTransposeA> { + using RegularMma = MmaPipelined< + Shape_, + IteratorA_, + SmemIteratorA_, + IteratorB_, + SmemIteratorB_, + ElementC_, + LayoutC_, + Policy_, + TransformA_, + TransformB_>; + + using WarpShape = typename Policy_::Operator::Shape; + using InstructionShape = typename Policy_::Operator::InstructionShape; + using ArchMmaOperator = typename Policy_::Operator; + + static constexpr bool kIsTransposedA = false; + using WarpIteratorA = WarpIteratorA_; + using IteratorB = + typename cutlass::transform::threadblock::MakeIteratorResidualLast< + IteratorB_>::Iterator; + + using Mma = typename cutlass::gemm::threadblock::MmaPipelinedFromSharedMemory< + Shape_, + WarpIteratorA, + kScaleOperandA, + kMaxK, + IteratorB, + SmemIteratorB_, + ElementC_, + LayoutC_, + Policy_>; +}; + +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + typename WarpIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + int kMaxK, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA> +struct DefaultMmaFromSharedMemory< + MmaMultistage< + Shape_, + IteratorA_, + SmemIteratorA_, + CacheOpA, + IteratorB_, + SmemIteratorB_, + CacheOpB, + ElementC_, + LayoutC_, + Policy_, + Stages, + SharedMemoryClear>, + kMaxK, + WarpIteratorA_, + kScaleOperandA, + kTransposeA> { + using RegularMma = MmaMultistage< + Shape_, + IteratorA_, + SmemIteratorA_, + CacheOpA, + IteratorB_, + SmemIteratorB_, + CacheOpB, + ElementC_, + LayoutC_, + Policy_, + Stages, + SharedMemoryClear>; + + using WarpShape = typename Policy_::Operator::Shape; + using InstructionShape = typename Policy_::Operator::InstructionShape; + using WarpIteratorTranspose = TransposeWarpIterator; + static constexpr bool kIsTransposedA = + WarpIteratorTranspose::kSupportsTranspose && kTransposeA; + using WarpIteratorA = typename platform::conditional< + kIsTransposedA, + typename WarpIteratorTranspose::Iterator, + WarpIteratorA_>::type; + + // Reduce the number of stages if we don't need that many + static int constexpr kStagesMax = + (kMaxK + int(Shape_::kK) - 1) / int(Shape_::kK); + static int constexpr kStages = cutlass::const_min(Stages, kStagesMax); + + using IteratorB = + typename cutlass::transform::threadblock::MakeIteratorResidualLast< + IteratorB_>::Iterator; + using Mma = + typename cutlass::gemm::threadblock::MmaMultistageFromSharedMemory< + Shape_, + WarpIteratorA, + kScaleOperandA, + IteratorB, + SmemIteratorB_, + RegularMma::kCacheOpB, + ElementC_, + LayoutC_, + Policy_, + kStages, + kMaxK>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename IteratorC, + typename Operator, + typename scalar_t, + typename WarpShape_, + typename ThreadblockShape_> +struct B2bGemm; + +// Tensor Cores >= Sm75 specialization (Ampere ...) +template < /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Element type + typename Element_, + /// Layout of operand in memory + typename Layout_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions, concept: MatrixShape) + typename OpDelta_, + typename Operator, + typename scalar_t, + typename WarpShape_, + typename ThreadblockShape_> +struct B2bGemm< + cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + Shape_, + Element_, + Layout_, + InstructionShape_, + OpDelta_>, + Operator, + scalar_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = + typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + Shape_, + Element_, + Layout_, + InstructionShape_, + OpDelta_>; + using FragmentC = typename IteratorC::Fragment; + using InstructionShape = InstructionShape_; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using accum_t = Element_; + using lse_scalar_t = float; + + using SmemAccumulatorLayout = cutlass::layout::RowMajor; + + // Iterator to load accumulators (results of matmul in registers) + using FragmentIteratorAccumulator = + cutlass::epilogue::warp::FragmentIteratorTensorOp< + WarpShape, + InstructionShape, + accum_t, + typename Operator::Policy::Operator::FragmentC, + cutlass::layout::RowMajor>; + + // Iterator to store to shared-memory + using SmemIteratorD0 = typename cutlass::epilogue::warp::TileIteratorTensorOp< + WarpShape, + InstructionShape, + scalar_t, // accum_t, + SmemAccumulatorLayout>; + using AccumulatorSharedStorage = + cutlass::gemm::threadblock::AccumulatorSharedStorage< + ThreadblockShape, + typename SmemIteratorD0::Element, + typename SmemIteratorD0::TensorLayout, + typename SmemIteratorD0::Padding>; + // We need to provide an operation for the epilogue. Let's create an + // operation that does nothing (ScaleType::Nothing), just converts + // from accum_t (float) -> scalar_t (can be half) + using OutputOpNoOp = cutlass::epilogue::thread::LinearCombination< + typename SmemIteratorD0::Element, // ElementOutput + FragmentIteratorAccumulator::Fragment::kElements, + accum_t, // ElementAccumulator + typename SmemIteratorD0::Element, // ElementCompute + cutlass::epilogue::thread::ScaleType::Nothing>; + using Epilogue = cutlass::epilogue::threadblock::EpilogueSmemAccumulator< + SmemIteratorD0, + FragmentIteratorAccumulator, + SmemIteratorD0, // ScaleBiasIterator - not used + OutputOpNoOp>; + + // Epilogue 2: with LSE (for backwards pass) + static int const kElementsPerAccess = 2; // TODO: Why 2? + using IteratorAccumulatorLSE = + cutlass::transform::threadblock::VectorIterator< + cutlass::transform::threadblock::PredicatedVectorAccessIterator< + // Shape + cutlass::MatrixShape, + // WarpShape + cutlass::MatrixShape, + lse_scalar_t, + cutlass::layout::RowMajor, + kElementsPerAccess>>; + using EpilogueOpApplyLSE = cutlass::epilogue::thread::ApplyLogSumExp< + scalar_t, // ElementOutput_ + lse_scalar_t, // ElementLSE_ + accum_t, // ElementAccumulator_ + accum_t, // ElementCompute_ + 128 / cutlass::sizeof_bits::value + // FragmentIteratorAccumulator::Fragment::kElements + // InstructionShape::kM * InstructionShape::kN / 32 + >; + using EpilogueWithLSE = + cutlass::epilogue::threadblock::EpilogueSmemAccumulator< + SmemIteratorD0, + FragmentIteratorAccumulator, + IteratorAccumulatorLSE, + EpilogueOpApplyLSE>; + + static void CUTLASS_DEVICE accumToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id); + smem_iterator_attn.add_tile_offset( + tile_coords * + cutlass::MatrixCoord{ + SmemIteratorD0::TileIterations::kRow, + SmemIteratorD0::TileIterations::kColumn}); + Epilogue epilogue; + epilogue(OutputOpNoOp({}), smem_iterator_attn, accum); + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC& accum, + lse_scalar_t const* lse, + int32_t lse_extents, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + constexpr int32_t kAlignLSE = 32; + IteratorAccumulatorLSE iterator_lse( + lse, + {(int32_t)0, (int32_t)ceil_div(lse_extents, kAlignLSE) * kAlignLSE}, + thread_id, + warp_id, + cutlass::MatrixCoord{0, 0} // offset + ); + + SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id); + smem_iterator_attn.add_tile_offset( + tile_coords * + cutlass::MatrixCoord{ + SmemIteratorD0::TileIterations::kRow, + SmemIteratorD0::TileIterations::kColumn}); + EpilogueWithLSE epilogue; + EpilogueOpApplyLSE minus_lse_exp({}); + epilogue( + minus_lse_exp, + smem_iterator_attn, + accum, + // scale - unused + iterator_lse, + // bias + iterator_lse); + } +}; + +// Volta Specialization +// only supported for f16 +template +struct B2bGemm< + cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + cutlass::MatrixShape<32, 32>, + float, + cutlass::layout::RowMajor, + cutlass::gemm::GemmShape<16, 16, 4>, + cutlass::MatrixShape<1, 1>>, + Operator, + cutlass::half_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = + cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + cutlass::MatrixShape<32, 32>, + float, + cutlass::layout::RowMajor, + cutlass::gemm::GemmShape<16, 16, 4>, + cutlass::MatrixShape<1, 1>>; + using scalar_t = cutlass::half_t; + using accum_t = IteratorC::Element; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using FragmentC = IteratorC::Fragment; + using lse_scalar_t = float; + + // Storage in shared-memory for Q.Kt + using SmemAccumulatorLayout = + cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>; + using AccumulatorSharedStorage = + cutlass::gemm::threadblock::AccumulatorSharedStorage< + ThreadblockShape, + scalar_t, + SmemAccumulatorLayout, + cutlass::MatrixShape<0, 0> // Padding + >; + using TensorRef = cutlass::TensorRef; + using Policy = typename IteratorC::Policy; + using Element = accum_t; + // Those are MmaVoltaTensorOpAccumulatorTileIterator private fields + // Let's copy their values + static int const kElementsPerPartial = 4; + using EleShapePerPatial = typename cutlass::platform::conditional< + cutlass::platform::is_same::value, + cutlass::MatrixShape<2, 2>, + cutlass::MatrixShape<1, 4>>::type; + static int const kElementsPerMma = 8; + static int const kAccumulatorPatials = 2; + using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>; + + static void CUTLASS_DEVICE accumToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + // ctor - from MmaVoltaTensorOpAccumulatorTileIterator + TensorRef ref_(shared_storage.accum_ref()); + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + int accum_m, accum_n; + + if (cutlass::platform::is_same::value) { + // (quad[2],quad[0])+lane_in_quad[0] + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1); + // (quad[1])+lane_in_quad[1] + accum_n = + ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials + + (lane_in_quad & 2); + } else { + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + + lane_in_quad; // (quad[2],quad[0]) + accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials; + } + cutlass::MatrixCoord lane_offset(accum_m, accum_n); + + // Tile offset + ref_.add_coord_offset( + tile_coords * + cutlass::MatrixCoord( + {IteratorC::Shape::kRow, IteratorC::Shape::kColumn})); + + using AccessType = cutlass::Array; + + // store - from MmaVoltaTensorOpAccumulatorTileIterator + CUTLASS_PRAGMA_UNROLL + for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) { + CUTLASS_PRAGMA_UNROLL + for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + int mma_accum_start = + (((tile_n * Policy::TileIterations::kRow + tile_m) * + Policy::MmaIterations::kColumn + + mma_n) * + Policy::MmaIterations::kRow + + mma_m) * + kElementsPerMma; + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < kAccumulatorPatials; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < EleShapePerPatial::kRow; ++m) { + int accum_m = tile_m * Policy::InterleavedTile::kRow + + mma_m * QuadShapePerPatialMma::kRow + m * 2; + int accum_n = tile_n * Policy::InterleavedTile::kColumn + + mma_n * QuadShapePerPatialMma::kColumn + + p * Policy::InterleavedTile::kColumn / 2; + int r = (accum_m + lane_offset.row()); + AccessType to_store; + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < EleShapePerPatial::kColumn; ++n) { + int idx = mma_accum_start + p * kElementsPerPartial + + m * EleShapePerPatial::kColumn + n; + int c = (accum_n + n + lane_offset.column()); + to_store[n] = scalar_t(accum[idx]); + } + int c = (accum_n + lane_offset.column()); + assert(r < 32); + assert(c < 32); + *reinterpret_cast( + ref_.data() + ref_.offset({r, c})) = to_store; + } + } + } + } + } + } + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem( + AccumulatorSharedStorage& shared_storage, + typename IteratorC::Fragment& accum, + lse_scalar_t const* lse, + int lse_extent, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + // Non-optimized way to apply LSE to registers + // NOTE: accum is attn.T + // TODO: Optimize for each architecture + static constexpr int WarpSize = 32; + using AccumLambdaIterator = + typename DefaultMmaAccumLambdaIterator:: + Iterator; + auto lane_offset = + AccumLambdaIterator::get_lane_offset(lane_id, warp_id, tile_coords); + + cutlass::Array lse_prefetched; + lse_prefetched.clear(); + int rowIdx = 0; + int colIdx = 0; + AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + ++rowIdx; + colIdx = 0; + }, + [&](int accum_m, int accum_n, int idx) { + if (rowIdx == 1) { + lse_prefetched[colIdx] = accum_n < lse_extent + ? lse[accum_n] + : platform::numeric_limits::infinity(); + } + accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]); + ++colIdx; + }, + [&](int accum_m) {}); + accumToSmem(shared_storage, accum, lane_id, tile_coords); + } +}; + +// Simt Specialization +// for f32 on Sm70-Sm75 and f16/f32 below + +template < + typename Operator, + typename OperatorPolicy, + typename scalar_t, + typename WarpShape_, + typename ThreadblockShape_> +struct B2bGemm< + cutlass::gemm::warp::MmaSimtTileIterator< + cutlass::MatrixShape<32, 32>, + cutlass::gemm::Operand::kC, + float, + cutlass::layout::RowMajor, + OperatorPolicy, + 1, + 1>, + Operator, + scalar_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator< + cutlass::MatrixShape<32, 32>, + cutlass::gemm::Operand::kC, + float, + cutlass::layout::RowMajor, + OperatorPolicy, + 1, + 1>; + using accum_t = typename IteratorC::Element; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using FragmentC = typename IteratorC::Fragment; + using lse_scalar_t = float; + + // Storage in shared-memory for Q.Kt + using AccumulatorSharedStorage = + cutlass::gemm::threadblock::AccumulatorSharedStorage< + ThreadblockShape, + scalar_t, + cutlass::layout::ColumnMajor, + cutlass::MatrixShape<0, 0> // Padding + >; + + static void CUTLASS_DEVICE accumToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + using Policy = typename IteratorC::Policy; + using Element = typename IteratorC::Element; + using Iterations = typename IteratorC::Iterations; + using Delta = typename IteratorC::Delta; + + auto ref_ = shared_storage.accum_ref(); + // ctor - MmaSimtTileIterator + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN); + + ref_.add_coord_offset(lane_offset); + + // Tile offset + ref_.add_coord_offset( + tile_coords * + cutlass::MatrixCoord( + {IteratorC::Shape::kRow, IteratorC::Shape::kColumn})); + + // store - MmaSimtTileIterator + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) { + int r = + Policy::LaneMmaShape::kM * (mma_m * Policy::WarpShape::kRow) + + m; + int c = mma_n * Delta::kColumn + n; + int idx = n + + Policy::LaneMmaShape::kN * + (mma_n + + Iterations::kColumn * + (m + mma_m * Policy::LaneMmaShape::kM)); + ref_.at({r, c}) = scalar_t(accum[idx]); + } + } + } + } + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem( + AccumulatorSharedStorage& shared_storage, + typename IteratorC::Fragment& accum, + lse_scalar_t const* lse, + int lse_extent, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + // Non-optimized way to apply LSE to registers + // NOTE: accum is attn.T + // TODO: Optimize for each architecture + static constexpr int WarpSize = 32; + using AccumLambdaIterator = + typename DefaultMmaAccumLambdaIterator:: + Iterator; + auto lane_offset = + AccumLambdaIterator::get_lane_offset(lane_id, warp_id, tile_coords); + + cutlass::Array lse_prefetched; + lse_prefetched.clear(); + int rowIdx = 0; + int colIdx = 0; + AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + ++rowIdx; + colIdx = 0; + }, + [&](int accum_m, int accum_n, int idx) { + if (rowIdx == 1) { + lse_prefetched[colIdx] = accum_n < lse_extent + ? lse[accum_n] + : platform::numeric_limits::infinity(); + } + accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]); + ++colIdx; + }, + [&](int accum_m) {}); + accumToSmem(shared_storage, accum, lane_id, tile_coords); + } +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm_kernel_utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm_kernel_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..7d5068ca56480f6e99198b0003aa7d87af68eaab --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm_kernel_utils.h @@ -0,0 +1,214 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include + +//////////////////////////////////////////////////////////////////////////////// +// Some helper functions +//////////////////////////////////////////////////////////////////////////////// +#define DISPATCH_TYPES(tensor, func) \ + { \ + if (query.scalar_type() == at::ScalarType::Float) { \ + using scalar_t = float; \ + func(); \ + } else if (query.scalar_type() == at::ScalarType::Half) { \ + using scalar_t = cutlass::half_t; \ + func(); \ + } else if (query.scalar_type() == at::ScalarType::BFloat16) { \ + using scalar_t = cutlass::bfloat16_t; \ + func(); \ + } else { \ + TORCH_CHECK(false, "Only fp32, half & bf16 supported at the moment"); \ + } \ + } + +#define DISPATCH_BOOL(BOOL_V, BOOL_NAME, F) \ + { \ + if (BOOL_V) { \ + constexpr bool BOOL_NAME = true; \ + F(); \ + } else { \ + constexpr bool BOOL_NAME = false; \ + F(); \ + } \ + } +#define DISPATCH_ARCHTAG(CC, func) \ + { \ + if (CC >= 80) { \ + using ArchTag = cutlass::arch::Sm80; \ + func(); \ + } else if (CC >= 75) { \ + using ArchTag = cutlass::arch::Sm75; \ + func(); \ + } else if (CC >= 70) { \ + using ArchTag = cutlass::arch::Sm70; \ + func(); \ + } else if (CC >= 50) { \ + using ArchTag = cutlass::arch::Sm50; \ + func(); \ + } else { \ + TORCH_CHECK( \ + false, \ + "Your device is too old. We require compute capability >= 50"); \ + } \ + } + +#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK(TENSOR.is_contiguous()); + +#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK( \ + TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous"); + +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + TORCH_CHECK( \ + uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") + +#define ASSIGN_CHECK_OVERFLOW(A, B) \ + { \ + A = B; \ + TORCH_CHECK( \ + B < std::numeric_limits::max(), #B " overflows"); \ + } + +namespace gemm_kernel_utils { + +template +constexpr CUTLASS_HOST_DEVICE integer ceil_div(integer n, integer m) { + return (n + m - 1) / m; +} + +template +constexpr CUTLASS_HOST_DEVICE integer align_up(integer n, integer m) { + return ((n + m - 1) / m) * m; +} + +//////////////////////////////////////////////////////////////////////////////// +// Determine the type of GEMM we do (TensorCores or not, Shapes ...) +// TODO: Maybe we could rely on Cutlass's DefaultGemm templates +//////////////////////////////////////////////////////////////////////////////// + +// Fallback to Simt (FMA on cuda cores) if not in a special case below +template +struct DefaultGemmType { + static constexpr int ThreadK = 8; + static constexpr int WarpK = 8; + static constexpr int kMinimumAlignment = 1; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + using OpClass = cutlass::arch::OpClassSimt; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specialization for tensorcores with f32 +template +struct DefaultGemmType< + ArchTag, + float, + typename cutlass::platform::enable_if< + ArchTag::kMinComputeCapability >= 80>::type> { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 4; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Operator = cutlass::arch::OpMultiplyAddFastF32; +}; + +// Specialization for tensorcores with f16/bf16 - Sm75+ +template +struct DefaultGemmType< + ArchTag, + scalar_t, + typename cutlass::platform::enable_if< + ArchTag::kMinComputeCapability >= 75 && + cutlass::sizeof_bits::value == 16>::type> { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 4; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specialization for tensorcores with f16 - Volta +template <> +struct DefaultGemmType { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 2; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Enables to do +// `auto x = kCondition ? fa(arg) : fb(arg)` +// when `fa` and `fb` have different types +template +struct call_conditional; + +template +struct call_conditional { + template + static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg) + -> decltype(ta(arg)) { + return ta(arg); + } +}; + +template +struct call_conditional { + template + static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg) + -> decltype(tb(arg)) { + return tb(arg); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Mark a variable as warp-uniform - enables some compiler optimizations +// The cheapest way to do it is just to broadcast it from lane 0 +//////////////////////////////////////////////////////////////////////////////// + +template +CUTLASS_DEVICE T warp_uniform(T value) { + struct { + union { + T value; + uint32_t asInt; + }; + } p; + p.value = value; + p.asInt = __shfl_sync(0xffffffff, (unsigned)p.asInt, 0); + return p.value; +} + +template +CUTLASS_DEVICE T* warp_uniform(T* ptr) { + struct { + union { + T* ptr; + uint32_t asInt[2]; + }; + } p; + p.ptr = ptr; + p.asInt[0] = warp_uniform(p.asInt[0]); + p.asInt[1] = warp_uniform(p.asInt[1]); + return p.ptr; +} +} // namespace gemm_kernel_utils + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..6fb3a6a1dc8297ee1e3ba547ee2df92c3ce6d8fe --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h @@ -0,0 +1,2619 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +using namespace gemm_kernel_utils; + +namespace PyTorchMemEffAttention { +namespace { + +template +struct GmemTile { + /* + Helper functions to efficient store/load RF to gmem + + GEMM accumulators have a particular format on A100, and + it takes some compute/shared-memory to rearrange them to + a RowMajor or ColumnMajor format in global memory through + an Epilogue. The same complexity goes for loading into RF. + + This class loads/stores RF as they are, and can be used for + efficient accumulation across gemms for instance: + + ``` + GmemTile tile; + for (int i = 0; i < N; ++i) { + // ... + + Fragment accum; + if (i == 0) { + accum.clear(); + } else { + tile.load(accum); + } + mma(accum, ...); + if (i < N-1) { + // Store for next GEMM + tile.store(accum); + } else { + // Store in tensor (eg RowMajor) + epilogue(accum); + } + + // ... + } + ``` + */ + + // 128bits per thread + using AccessType = cutlass::Array; + static constexpr int32_t kBytes = sizeof(AccessType); + static constexpr int32_t kStride = kNumThreads * AccessType::kElements; + static constexpr int32_t kNumIters = + FragmentType::kElements / AccessType::kElements; + static constexpr int32_t kElementsStored = + kNumThreads * FragmentType::kElements; + static_assert( + FragmentType::kElements % AccessType::kElements == 0, + "fragment not aligned on 128 bits"); + + float* ptr; + + CUTLASS_DEVICE void load(FragmentType& fragment, int thread_id) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + AccessType* __restrict__ gmem_ptr = reinterpret_cast( + ptr + thread_id * AccessType::kElements + i * kStride); + AccessType sub_fragment; + cutlass::arch::global_load( + sub_fragment, gmem_ptr, true); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + fragment[i * AccessType::kElements + j] = sub_fragment[j]; + } + } + } + + CUTLASS_DEVICE void store(FragmentType const& fragment, int thread_id) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + AccessType* __restrict__ gmem_ptr = reinterpret_cast( + ptr + thread_id * AccessType::kElements + i * kStride); + AccessType sub_fragment; + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + sub_fragment[j] = fragment[i * AccessType::kElements + j]; + } + cutlass::arch::global_store( + sub_fragment, gmem_ptr, true); + } + } + + CUTLASS_DEVICE void storeAtomicAdd( + FragmentType const& fragment, + int thread_id) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + float* gmem_ptr = ptr + thread_id * AccessType::kElements + i * kStride; + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + float val = fragment[i * AccessType::kElements + j]; + float* ptr = gmem_ptr + j; + atomicAdd(ptr, val); + } + } + } +}; + +struct AtomicLock { + CUTLASS_DEVICE static void acquire( + int32_t* lock, + int set_val, + int thread_id) { + if (thread_id == 0) { + while (atomicCAS(lock, 0 /*cmp*/, set_val /*setval*/) != set_val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + __nanosleep(40); +#endif + } + } + __syncthreads(); + } + CUTLASS_DEVICE static void release(int32_t* lock, int thread_id) { + if (thread_id == 0) { + int status = 0; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile("st.global.release.gpu.b32 [%0], %1;\n" + : + : "l"(lock), "r"(status)); +#else + asm volatile("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); +#endif + } + } +}; + +template +constexpr int getWarpsPerSmBw() { + bool is_half = !cutlass::platform::is_same::value; + if (Arch::kMinComputeCapability >= 80) { + return is_half ? 12 : 8; + } + return 8; +} +} // namespace + +template < + // which arch we target (eg `cutlass::arch::Sm80`) + typename ArchTag_, + // input/output type + typename scalar_t_, + // run optimized kernel because memory accesses will be aligned + bool kIsAligned_, + // use dropout if enabled + bool kApplyDropout_, + // when doing a GEMM, preload the next one (uses more shmem) + bool kPreload_, + // block dimensions + int kBlockSizeI_, + int kBlockSizeJ_, + // upperbound on `max(value.shape[-1], query.shape[-1])` + int kMaxK_ = (int)cutlass::platform::numeric_limits::max(), + // assumes that `cu_seqlen` is None, and + // (1) `num_queries % kBlockSizeI == 0` + // (2) `num_keys % kBlockSizeJ == 0` + bool kKeysQueriesAlignedToBlockSize_ = false> +struct AttentionBackwardKernel { + enum CustomMaskType { + NoCustomMask = 0, + CausalFromTopLeft = 1, + CausalFromBottomRight = 2, + NumCustomMaskTypes, + }; + using scalar_t = scalar_t_; + using output_t = scalar_t; + using output_accum_t = float; + using lse_scalar_t = float; + using accum_t = float; + using ArchTag = ArchTag_; + static constexpr bool kIsAligned = kIsAligned_; + static constexpr bool kApplyDropout = kApplyDropout_; + static constexpr bool kPreload = kPreload_; + static constexpr int kBlockSizeI = kBlockSizeI_; + static constexpr int kBlockSizeJ = kBlockSizeJ_; + static constexpr int kMaxK = kMaxK_; + static constexpr bool kKeysQueriesAlignedToBlockSize = + kKeysQueriesAlignedToBlockSize_; + + static constexpr int64_t kWarpSize = 32; + + // If this is true, we store and accumulate dK/dV in RF + // rather than going back to gmem every time + static constexpr bool kIsHalf = cutlass::sizeof_bits::value <= 16; + static constexpr bool kOutputInRF = kIsHalf && kMaxK <= kBlockSizeI; + static_assert( + !kPreload || + (kIsHalf && ArchTag::kMinComputeCapability >= 80 && kOutputInRF), + "preload MMA not supported"); + static constexpr bool kPrologueQK = kPreload; + static constexpr bool kPrologueGV = kPreload; + static constexpr bool kPrologueDOV = kPreload; + static constexpr bool kPrologueGQ = kPreload; + static constexpr bool kPrologueGK = kPreload; + + static constexpr int64_t kNumWarpsPerBlock = + (kBlockSizeI * kBlockSizeJ) / (32 * 32); + + // Compute delta for the f16 kernels + // TODO: Figure out why it's slower on the f32 kernels + // (something due to RF pressure?) + // TODO: Remove condition on `kOutputInRF` - this is needed to work + // around a compiler bug on V100, not exactly sure why but I spent + // too much time on this already. Reproducible with + // (B, Mq, Mkv, K) = (1, 1, 1, 136) for instance + static constexpr bool kKernelComputesDelta = + kIsHalf && (kOutputInRF || ArchTag::kMinComputeCapability != 70); + + // Launch bounds + static constexpr int64_t kNumThreads = kWarpSize * kNumWarpsPerBlock; + static constexpr int64_t kMinBlocksPerSm = + getWarpsPerSmBw() / kNumWarpsPerBlock; + + using GemmType = DefaultGemmType; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + typename GemmType::OpClass, + ArchTag, + scalar_t, + scalar_t, + scalar_t, // ElementC + accum_t // ElementAccumulator + >; + static constexpr auto kOptimalAlignement = cutlass::platform::max( + DefaultConfig::kAlignmentA, + DefaultConfig::kAlignmentB); + static constexpr auto kMinimumAlignment = GemmType::kMinimumAlignment; + + struct MatmulQK { + /* + attn_T = k_j @ q_i.transpose(-2, -1) # matmul + attn_T = (attn_T - logsumexp[i_start:i_end].unsqueeze(1).transpose(-2, + -1)).exp() # epilogue + + with attn_T.shape = (kBlockSizeJ, kBlockSizeI) + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using DefaultMma = typename cutlass::gemm::threadblock::DefaultMma< + scalar_t, // ElementA + cutlass::layout::RowMajor, // LayoutA + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment, + scalar_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + accum_t, // ElementC + cutlass::layout::RowMajor, // LayoutC + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + DefaultConfig::kStages, + typename GemmType::Operator, + false, // AccumulatorsInRowMajor = false, + cutlass::gemm::SharedMemoryClearOption::kNone>; + using MmaCore = typename DefaultMma::MmaCore; + using Mma = + typename MakeCustomMma::Mma; + + // used for efficient load of bias tile (Bij) from global memory to shared + // memory + using BiasLoader = TileSmemLoader< + scalar_t, + // Bij is applied to transposed attn matrix tile (Pij.T). Bij is loaded + // row-major but needs to have transposed shape so we get the same + // elements. + cutlass::MatrixShape, + MmaCore::kThreads, + // input restriction: kv_len has to be a multiple of this value + 128 / cutlass::sizeof_bits::value>; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< + typename Mma::Operator::IteratorC, + typename Mma::Operator, + scalar_t, + WarpShape, + ThreadblockShape>; + using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator< + typename Mma::Operator::IteratorC, + accum_t, + kWarpSize>::Iterator; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MatmulGradV { + /* + grad_v[j_start:j_end] += attn_T @ do_i # matmul + + Dimensions: (kBlockSizeJ * kNumWarpsPerBlock, kBlockSizeI, K) + (we might need to iterate multiple times on K) + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + DefaultConfig::kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::RowMajor, // LayoutB, + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + output_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + // if dropout: + // for computing dVj += (Pij.T * Zij) @ dOi + // Pij_dropped.T = Pij.T * Zij is computed on the fly as fragments of + // Pij.T are loaded in. The reason we do it this way is because Pij.T and + // Zij are reused in later steps, while Pij_dropped.T is only needed in + // this step. computing Pij_dropped.T on the fly allows us to avoid + // keeping all 3 of Pij_dropped.T, Pij.T, and Zij in shared memory at the + // same time. + // if no dropout: + // for computing dVj += Pij.T @ dOi + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Operator::Shape, // WarpShape + typename DefaultGemm::Mma::Operator:: + InstructionShape, // InstructionShape + typename DefaultGemm::Mma::Operator:: + IteratorA, // RegularWarpIterator + typename DefaultGemm::Mma::Policy // Policy + >::WarpIterator; + using DefaultMmaFromSmem = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulQK::AccumulatorSharedStorage::Shape::kN, + WarpIteratorA, + kApplyDropout>; // kScaleOperandA + + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + + struct MatmulDOIVJ { + /* + doi_t_vj = do_i @ v_j.transpose(-2, -1) # matmul + tmp = (doi_t_vj - Di.unsqueeze(1)) * attn # inplace / epilogue? + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + + using ElementC = output_t; + using ElementAccum = accum_t; + + // no-op output op - epilogue just stores result to global memory + using BiasGradEpilogueOutputOp = + typename cutlass::epilogue::thread::LinearCombination< + ElementC, + DefaultConfig::EpilogueOutputOp::kCount, + typename DefaultConfig::EpilogueOutputOp::ElementAccumulator, + typename DefaultConfig::EpilogueOutputOp::ElementCompute, + cutlass::epilogue::thread::ScaleType::Nothing>; + + using DefaultGemm = typename cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA + cutlass::layout::RowMajor, // LayoutA + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment, + scalar_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + ElementC, // ElementC + cutlass::layout::RowMajor, // LayoutC + ElementAccum, // ElementAccumulator + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + BiasGradEpilogueOutputOp, // EpilogueOutputOp + void, // ThreadblockSwizzle (not used) + // multiple preloads, dropout Zij tile, and 3 stages push us over shared + // memory capacity on A100. set a ceiling on number of stages to save + // shared memory if dropout is in use. + kPreload && kApplyDropout && (kBlockSizeI * kBlockSizeJ > 64 * 64) + ? cutlass::const_min(2, DefaultConfig::kStages) + : DefaultConfig::kStages, // Stages + false, // SplitKSerial + typename GemmType::Operator, + cutlass::gemm::SharedMemoryClearOption::kNone>; + using Mma = typename MakeCustomMma::Mma; + using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator< + typename Mma::Operator::IteratorC, + ElementAccum, + kWarpSize>::Iterator; + + // epilogue used to write bias gradient, which is just the output of this + // matmul with some operations applied to the fragment + using BiasGradEpilogue = typename DefaultGemm::Epilogue; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< + typename DefaultGemm::Mma::Operator::IteratorC, + typename DefaultGemm::Mma::Operator, + scalar_t, + WarpShape, + ThreadblockShape>; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MatmulGradQ { + // grad_q <- tmp @ k_j + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + DefaultConfig::kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::RowMajor, // LayoutB, + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + output_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Operator::Shape, + typename DefaultGemm::Mma::Operator::InstructionShape, + typename DefaultGemm::Mma::Operator::IteratorA, + typename DefaultGemm::Mma::Policy>::WarpIterator; + using DefaultMmaFromSmem = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulDOIVJ::AccumulatorSharedStorage::Shape::kN, + WarpIteratorA, + false>; // kScaleOperandA + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + struct MatmulGradK { + // grad_k <- tmp.transpose(-2, -1) @ q_i + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + DefaultConfig::kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::RowMajor, // LayoutB, + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + output_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Operator::Shape, + typename DefaultGemm::Mma::Operator::InstructionShape, + typename DefaultGemm::Mma::Operator::IteratorA, + typename DefaultGemm::Mma::Policy>::WarpIterator; + using DefaultMmaFromSmemN = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulQK::AccumulatorSharedStorage::Shape::kN, // kMaxK + WarpIteratorA, + false>; // kScaleOperandA + using DefaultMmaFromSmemT = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulDOIVJ::AccumulatorSharedStorage::Shape::kM, // kMaxK + WarpIteratorA, + false, // kScaleOperandA + kPreload>; // kTransposeA + using DefaultMmaFromSmem = typename cutlass::platform::conditional< + DefaultMmaFromSmemT::kIsTransposedA, + DefaultMmaFromSmemT, + DefaultMmaFromSmemN>::type; + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + + // NOTE: nvcc 12.4 has correctness errors with this on M60 (sm52) + // when there is an attention bias. Let's just disable it for now. + static constexpr auto kMinSm = ArchTag::kMinComputeCapability; + static constexpr bool kEnableSplitKeys = kMinSm >= 70; + + static constexpr bool kNeedsAccumGradQ = kEnableSplitKeys || + !cutlass::platform::is_same::value; + static constexpr bool kNeedsAccumGradK = !kOutputInRF && + !cutlass::platform::is_same::value; + static constexpr bool kNeedsAccumGradV = !kOutputInRF && + !cutlass::platform::is_same::value; + + struct GradQTempStorage { + int32_t lock; + int32_t counter; + int32_t pad[2]; // pad to 128bits + output_accum_t buffer[MatmulGradQ::AccumTileGmem::kElementsStored]; + }; + + struct Params { + // Input tensors + const scalar_t* query_ptr = nullptr; // [Mq, nH, K] + const scalar_t* key_ptr = nullptr; // [Mk, nH, K] + const scalar_t* value_ptr = nullptr; // [Mk, nH, Kv] + const scalar_t* bias_ptr = nullptr; + const lse_scalar_t* logsumexp_ptr = nullptr; // [nH, Mq] + const scalar_t* output_ptr = nullptr; // [Mq, nH, Kv] + const scalar_t* grad_output_ptr = nullptr; // [Mq, nH, Kv] + accum_t* delta_ptr = nullptr; // [nH, Mq] + const int32_t* cu_seqlens_q_ptr = nullptr; + const int32_t* cu_seqlens_k_ptr = nullptr; + + // Output tensors + output_t* grad_query_ptr = nullptr; // [Mq, nH, K] + output_t* grad_key_ptr = nullptr; // [Mk, nH, K] + output_t* grad_value_ptr = nullptr; // [Mk, nH, Kv] + output_t* grad_bias_ptr = nullptr; + + // Accumulators + output_accum_t* workspace = nullptr; // [Mq, Kq] + [Mkv, Kq] + [Mkv, Kv] + output_accum_t* workspace_gv = + nullptr; // (will be calculated by the kernel) + GradQTempStorage* workspace_gq = + nullptr; // (will be calculated by the kernel) + + // Sliding window. ignored if == 0 + int32_t window_size = 0; + + // Scale + accum_t scale = 1.0f; + + // Dimensions/strides + int32_t head_dim = -1; + int32_t head_dim_value = -1; + int32_t num_queries = -1; + int32_t num_keys = -1; + int32_t num_heads = -1; + uint8_t custom_mask_type = NoCustomMask; + + int64_t q_strideM = -1; + int64_t k_strideM = -1; + int64_t v_strideM = -1; + int64_t bias_strideM = 0; + int64_t gO_strideM = -1; + int64_t gB_strideM = -1; + int8_t gQKV_strideM_multiplier = 1; // 3 for packed, 1 otherwise + + at::PhiloxCudaState rng_engine_inputs = {0, 0}; + + // RNG sequence offset based on batch_id and head_id + unsigned long long dropout_batch_head_rng_offset = 0; + float dropout_prob = 0.0f; + + CUTLASS_HOST_DEVICE int64_t o_strideM() const { + return head_dim_value * num_heads; + } + CUTLASS_HOST_DEVICE int64_t gQ_strideM() const { + return gQKV_strideM_multiplier * num_heads * head_dim; + } + CUTLASS_HOST_DEVICE int64_t gK_strideM() const { + return gQKV_strideM_multiplier * num_heads * head_dim; + } + CUTLASS_HOST_DEVICE int64_t gV_strideM() const { + return gQKV_strideM_multiplier * num_heads * head_dim_value; + } + + // Everything below is only used in `advance_to_block` + // and shouldn't use registers + int64_t o_strideH = -1; + int32_t q_strideH = -1; + int32_t k_strideH = -1; + int32_t v_strideH = -1; + int64_t bias_strideH = 0; + int64_t o_strideB = -1; + int64_t q_strideB = -1; + int64_t k_strideB = -1; + int64_t v_strideB = -1; + int64_t bias_strideB = 0; + int64_t lse_strideB = -1; + int64_t lse_strideH = -1; + int64_t delta_strideB = -1; + int64_t delta_strideH = -1; + int32_t num_batches = -1; + int16_t num_splits_key = 1; // We use `gridDim.x` inside kernel + + int64_t gO_strideB = 0; + int64_t gQ_strideB = 0; + int64_t gK_strideB = 0; + int64_t gV_strideB = 0; + int64_t gB_strideB = 0; + int64_t gO_strideH = 0; + int64_t gQ_strideH = 0; + int64_t gK_strideH = 0; + int64_t gV_strideH = 0; + int64_t gB_strideH = 0; + + CUTLASS_HOST_DEVICE int16_t num_splits_key_device() const { +#ifdef __CUDA_ARCH__ + return kEnableSplitKeys ? gridDim.x : 1; +#else + return num_splits_key; // for host-side tests +#endif + } + CUTLASS_HOST_DEVICE int16_t split_key_device() const { +#ifdef __CUDA_ARCH__ + return kEnableSplitKeys ? blockIdx.x : 0; +#else + return 0; // for host-side tests +#endif + } + + CUTLASS_DEVICE bool advance_to_block() { + int64_t batch_id = blockIdx.z; + int32_t head_id = blockIdx.y; + + if (kNeedsAccumGradQ || kNeedsAccumGradK || kNeedsAccumGradV) { + assert(workspace_size() == 0 || workspace != nullptr); + + workspace += (batch_id * num_heads + head_id) * workspace_strideBH(); + workspace = warp_uniform(workspace); + workspace_gv = workspace + workspace_elements_gk(); + workspace_gq = + (GradQTempStorage*)(workspace_gv + workspace_elements_gv()); + if (kEnableSplitKeys) { + workspace_gv += workspace_elements_gv() * split_key_device() / + num_splits_key_device(); + workspace += workspace_elements_gk() * split_key_device() / + num_splits_key_device(); + } + } else { + workspace = nullptr; + } + + // Advance pointers that depend on the total concatenated + // number of queries, as `num_queries` is modified in the block + // below + dropout_batch_head_rng_offset = + batch_id * (num_heads * num_queries * num_keys) + + head_id * (num_queries * num_keys); + logsumexp_ptr += batch_id * lse_strideB + head_id * lse_strideH; + + if (cu_seqlens_q_ptr != nullptr) { + assert(cu_seqlens_k_ptr != nullptr); + cu_seqlens_q_ptr += batch_id; + cu_seqlens_k_ptr += batch_id; + int32_t q_start = cu_seqlens_q_ptr[0]; + int32_t k_start = cu_seqlens_k_ptr[0]; + int64_t q_next_start = cu_seqlens_q_ptr[1]; + int64_t k_next_start = cu_seqlens_k_ptr[1]; + assert(q_next_start - q_start <= num_queries); + assert(k_next_start - k_start <= num_keys); + num_queries = q_next_start - q_start; + num_keys = k_next_start - k_start; + + // Jump manually + batch_id = 0; + + query_ptr += q_start * q_strideM; + key_ptr += k_start * k_strideM; + value_ptr += k_start * v_strideM; + assert(bias_ptr == nullptr); + assert(grad_bias_ptr == nullptr); + output_ptr += q_start * o_strideM(); + grad_output_ptr += q_start * gO_strideM; + delta_ptr += q_start; + + grad_query_ptr += q_start * gQ_strideM(); + grad_key_ptr += k_start * gK_strideM(); + grad_value_ptr += k_start * gV_strideM(); + } + + query_ptr += batch_id * q_strideB + head_id * q_strideH; + key_ptr += batch_id * k_strideB + head_id * k_strideH; + value_ptr += batch_id * v_strideB + head_id * v_strideH; + if (bias_ptr != nullptr) { + bias_ptr += batch_id * bias_strideB + head_id * bias_strideH; + } + output_ptr += batch_id * o_strideB + head_id * o_strideH; + grad_output_ptr += batch_id * gO_strideB + head_id * gO_strideH; + delta_ptr += batch_id * delta_strideB + head_id * delta_strideH; + + grad_query_ptr += batch_id * gQ_strideB + head_id * gQ_strideH; + grad_key_ptr += batch_id * gK_strideB + head_id * gK_strideH; + grad_value_ptr += batch_id * gV_strideB + head_id * gV_strideH; + if (grad_bias_ptr != nullptr) { + grad_bias_ptr += batch_id * gB_strideB + head_id * gB_strideH; + } + + // Some values are modified above + // Signal to the compiler that they are the same in all threads + // and can be stored in warp-uniform registers (Sm75+) + num_queries = warp_uniform(num_queries); + num_keys = warp_uniform(num_keys); + custom_mask_type = warp_uniform(custom_mask_type); + + query_ptr = warp_uniform(query_ptr); + key_ptr = warp_uniform(key_ptr); + value_ptr = warp_uniform(value_ptr); + bias_ptr = warp_uniform(bias_ptr); + logsumexp_ptr = warp_uniform(logsumexp_ptr); + output_ptr = warp_uniform(output_ptr); + grad_output_ptr = warp_uniform(grad_output_ptr); + delta_ptr = warp_uniform(delta_ptr); + + grad_query_ptr = warp_uniform(grad_query_ptr); + grad_key_ptr = warp_uniform(grad_key_ptr); + grad_value_ptr = warp_uniform(grad_value_ptr); + grad_bias_ptr = warp_uniform(grad_bias_ptr); + +#if 0 + PRINT_T0("[b:%d h:%d] dp[0]:%f Q:%f K:%f V:%f LSE:%f", + int(blockIdx.z), int(blockIdx.y), + float(delta_ptr[0]), + float(query_ptr[0]), float(key_ptr[0]), float(value_ptr[0]), + float(logsumexp_ptr[0]) + ) +#endif + return true; + } + + __host__ dim3 getBlocksGrid() const { + return dim3(num_splits_key, num_heads, num_batches); + } + __host__ dim3 getThreadsGrid() const { + return dim3(kWarpSize * kNumWarpsPerBlock, 1, 1); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gk() const { + if (!kNeedsAccumGradK) { + return 0; + } + return num_splits_key * kBlockSizeJ * + align_up(head_dim, kBlockSizeI); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gv() const { + if (!kNeedsAccumGradV) { + return 0; + } + return num_splits_key * kBlockSizeJ * + align_up(head_dim_value, kBlockSizeI); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gq() const { + if (!kNeedsAccumGradQ) { + return 0; + } + int num_blocks = ceil_div(num_queries, kBlockSizeI); + int num_cols = ceil_div(head_dim, MatmulGradQ::ThreadblockShape::kN); + return num_blocks * num_cols * sizeof(GradQTempStorage) / + sizeof(output_accum_t); + } + CUTLASS_HOST_DEVICE int64_t workspace_strideBH() const { + // Aligned on 128bits + return align_up( + workspace_elements_gk() + workspace_elements_gv() + + workspace_elements_gq(), + int64_t(4)); + } + CUTLASS_HOST_DEVICE int64_t workspace_size() const { + // Returns size of buffer we need to run this kernel + return num_batches * num_heads * workspace_strideBH() * sizeof(float); + } + CUTLASS_HOST_DEVICE bool should_zero_workspace() const { + return num_splits_key > 1 || window_size > 0; + } + }; + + // shared storage for keeping Zij matrix. not needed if we aren't using + // dropout, in which case we use an empty array to save shared memory + using ZijSharedStorage = typename cutlass::platform::conditional< + kApplyDropout, + typename MatmulQK::AccumulatorSharedStorage, + // dummy shared storage object that takes up no space. + typename cutlass::gemm::threadblock::AccumulatorSharedStorage< +#ifdef _WIN32 + // windows builds throw the error: + // "type containing an unknown-size array is not allowed" + // if we try to make Zij shared storage zero-sized. + // To get around this just make it sized 1 on windows. + typename cutlass::gemm::GemmShape<1, 1, 0>, +#else + typename cutlass::gemm::GemmShape<0, 0, 0>, +#endif + typename MatmulQK::AccumulatorSharedStorage::Element, + typename MatmulQK::AccumulatorSharedStorage::Layout, + typename cutlass::MatrixShape<0, 0>>>::type; + + struct SharedStoragePrologue { + struct { + cutlass::Array di; // (do_i * o_i).sum(-1) + typename MatmulQK::Mma::SharedStorageA mm_qk_k; + } persistent; + union { + struct { + // part1 - after Q.K / dV / dO.V + union { + // 1. efficient load of bias tile Bij, which is then applied to Pij + typename MatmulQK::BiasLoader::SmemTile bias; + // 4. store Pij. it is needed: + // - in dVj += (Pij.T * Zij) @ dOi + // - in dSij = Pij * (dPij - Di) + // 6. dVj += (Pij.T * Zij) @ dOi + // 10. write to fragment + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + }; + // 5. store Zij. it is needed in dVj += (Pij.T * Zij) @ dOi + ZijSharedStorage zij; + + union { + // 2. prologue for dVj + // 6. workspace for dVj += (Pij.T * Zij) @ dOi + typename MatmulGradV::Mma::SharedStorage mm_gradV; + // 7. dVj epilogue + typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue; + }; + + // 3. prologue for dPij_dropped + // 8. used in dPij_dropped = dOi @ Vj.T + typename MatmulDOIVJ::Mma::SharedStorage mm_doivj; + } part1; + + struct { + // part2 - dQ + union { + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part1) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + }; + typename MatmulGradK::Mma::SharedStorage mm_gradK; // (preload) + typename MatmulGradQ::Mma::SharedStorage mm_gradQ; // (preload) + union { + // store dB = dSij to global memory + typename MatmulDOIVJ::BiasGradEpilogue::SharedStorage gradB_epilogue; + typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue; + }; + + } part2; + + struct { + // part3 - after last iteration on dQ's epilogue / dK + union { + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part1) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + }; + typename MatmulGradK::Mma::SharedStorage mm_gradK; // (preload) + typename MatmulGradQ::DefaultEpilogue::SharedStorage + gradQ_epilogue_lastIter; + + typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue; + } part3; + + struct { + // part4 - after last iteration on dK's epilogue / preload next K.Q_t + typename MatmulQK::Mma::SharedStorageB mm_qk_q; + + // If we reach end of current key, dump RF->gmem with "final" epilogues + typename MatmulGradK::DefaultEpilogue::SharedStorage + gradK_epilogue_final; + typename MatmulGradV::DefaultEpilogue::SharedStorage + gradV_epilogue_final; + } part4; + }; + static void print_size() { + // Field size +#define FSZ(f) int((sizeof(((SharedStoragePrologue*)0)->f))) + + printf("Total smem: %d bytes\n", int(sizeof(SharedStoragePrologue))); + printf(" persistent: %db\n", FSZ(persistent)); + printf(" mm_qk_k: %db\n", FSZ(persistent.mm_qk_k)); + printf(" part1: %db\n", FSZ(part1)); + printf(" bias: %db\n", FSZ(part1.bias)); + printf(" attn_shared_storage: %db\n", FSZ(part1.attn_shared_storage)); + printf(" zij: %db\n", FSZ(part1.zij)); + printf(" mm_gradV: %db\n", FSZ(part1.mm_gradV)); + printf(" gradV_epilogue: %db\n", FSZ(part1.gradV_epilogue)); + printf(" mm_doivj: %db\n", FSZ(part1.mm_doivj)); + printf(" part2: %db\n", FSZ(part2)); + printf(" tmpT_shared_storage: %db\n", FSZ(part2.tmpT_shared_storage)); + printf(" tmp_shared_storage: %db\n", FSZ(part2.tmp_shared_storage)); + printf(" mm_gradK: %db\n", FSZ(part2.mm_gradK)); + printf(" mm_gradQ: %db\n", FSZ(part2.mm_gradQ)); + printf(" gradB_epilogue: %db\n", FSZ(part2.gradB_epilogue)); + printf(" gradQ_epilogue: %db\n", FSZ(part2.gradQ_epilogue)); + printf(" part3: %db\n", FSZ(part3)); + printf(" tmpT_shared_storage: %db\n", FSZ(part3.tmpT_shared_storage)); + printf(" part4: %db\n", FSZ(part4)); + printf(" mm_qk_q: %db\n", FSZ(part4.mm_qk_q)); + printf( + " gradK_epilogue_final: %db\n", FSZ(part4.gradK_epilogue_final)); + printf( + " gradV_epilogue_final: %db\n", FSZ(part4.gradV_epilogue_final)); + } +// =========================================== +#define FIELD(INSIDE_STRUCT, FIELDNAME) \ + CUTLASS_DEVICE auto& FIELDNAME() { \ + return INSIDE_STRUCT.FIELDNAME; \ + } + + FIELD(persistent, di) + FIELD(persistent, mm_qk_k) + FIELD(part1, bias) + FIELD(part1, attn_shared_storage) + FIELD(part1, zij) + FIELD(part1, mm_gradV) + FIELD(part1, gradV_epilogue) + FIELD(part1, mm_doivj) + FIELD(part2, mm_gradK) + FIELD(part2, mm_gradQ) + FIELD(part2, gradB_epilogue) + FIELD(part2, gradQ_epilogue) + FIELD(part2, tmp_shared_storage) + FIELD(part3, tmpT_shared_storage) + FIELD(part3, gradQ_epilogue_lastIter) + FIELD(part3, gradK_epilogue) + FIELD(part4, mm_qk_q) + FIELD(part4, gradK_epilogue_final) + FIELD(part4, gradV_epilogue_final) + }; + + struct SharedStorageNoPrologue { + struct { + cutlass::Array di; // (do_i * o_i).sum(-1) + } persistent; + union { + struct { + // part1 - Q.K matmul + typename MatmulQK::Mma::SharedStorageA mm_qk_k; + typename MatmulQK::Mma::SharedStorageB mm_qk_q; + } part1; + + struct { + // part2 - compute gradV + union { + // 1. efficient load of bias tile Bij, which is then applied to Pij + typename MatmulQK::BiasLoader::SmemTile bias; + // 2. store Pij to shared memory. it is needed: + // - in this step, where it is used in dVj += (Pij.T * Zij) @ dOi + // - in next step where it is used in dSij = Pij * (dPij - Di) + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + }; + // 3. store Zij. it is needed in this step, where it is used + // to compute Pij_dropped = Pij * Zij on the fly as fragments of Pij are + // loaded for the computation of dVj. + ZijSharedStorage zij; + + union { + typename MatmulGradV::Mma::SharedStorage mm_gradV; + typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue; + }; + } part2; + + struct { + // part3 - DO.V matmul + union { + // first compute dPij = (dOi @ Vj.T) * Zij + // and dSij = Pij * (dPij - Di) + struct { + // (from part2) - Pij for computing dSij = Pij * (dPij - Di) + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + // matmul to compute dOiVj + typename MatmulDOIVJ::Mma::SharedStorage mm_doivj; + }; + // then store dB = dSij to global memory + typename MatmulDOIVJ::BiasGradEpilogue::SharedStorage gradB_epilogue; + }; + } part3; + + struct { + // part4 - compute gradQ + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part2) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + union { + typename MatmulGradQ::Mma::SharedStorage mm_gradQ; + typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue; + typename MatmulGradQ::DefaultEpilogue::SharedStorage + gradQ_epilogue_lastIter; + }; + } part4; + + struct { + // part5 - compute gradK + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part2) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + union { + typename MatmulGradK::Mma::SharedStorage mm_gradK; + typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue; + }; + } part5; + + struct { + // part6 - store RF accumulated into gmem + typename MatmulGradK::DefaultEpilogue::SharedStorage + gradK_epilogue_final; + typename MatmulGradV::DefaultEpilogue::SharedStorage + gradV_epilogue_final; + } part6; + }; + static void print_size() { +#define FIELD_SIZEOF(f) int((sizeof(((SharedStorageNoPrologue*)0)->f))) + printf("Total smem: %d bytes\n", int(sizeof(SharedStorageNoPrologue))); + printf(" persistent: %db\n", FIELD_SIZEOF(persistent)); + printf(" part1: %db\n", FIELD_SIZEOF(part1)); + printf(" part2: %db\n", FIELD_SIZEOF(part2)); + printf(" part3: %db\n", FIELD_SIZEOF(part3)); + printf(" part4: %db\n", FIELD_SIZEOF(part4)); + printf(" part5: %db\n", FIELD_SIZEOF(part5)); + printf(" part6: %db\n", FIELD_SIZEOF(part6)); + } +// =========================================== +#define FIELD(INSIDE_STRUCT, FIELDNAME) \ + CUTLASS_DEVICE auto& FIELDNAME() { \ + return INSIDE_STRUCT.FIELDNAME; \ + } + + FIELD(persistent, di) + FIELD(part1, mm_qk_k) + FIELD(part1, mm_qk_q) + FIELD(part2, bias) + FIELD(part2, attn_shared_storage) + FIELD(part2, zij) + FIELD(part2, mm_gradV) + FIELD(part2, gradV_epilogue) + FIELD(part3, mm_doivj) + FIELD(part3, gradB_epilogue) + FIELD(part4, tmpT_shared_storage) + FIELD(part4, tmp_shared_storage) + FIELD(part4, mm_gradQ) + FIELD(part4, gradQ_epilogue) + FIELD(part4, gradQ_epilogue_lastIter) + FIELD(part5, mm_gradK) + FIELD(part5, gradK_epilogue) + FIELD(part6, gradK_epilogue_final) + FIELD(part6, gradV_epilogue_final) + }; + + using SharedStorage = typename cutlass::platform::conditional< + kPreload, + SharedStoragePrologue, + SharedStorageNoPrologue>::type; + + struct OutputFragments { + typename MatmulGradV::Mma::FragmentC gradV; + typename MatmulGradK::Mma::FragmentC gradK; + + CUTLASS_DEVICE void clear() { + gradV.clear(); + gradK.clear(); + } + }; + + static bool __host__ check_supported(Params const& p) { + CHECK_ALIGNED_PTR(p.query_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.key_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.value_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.output_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.grad_output_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.bias_ptr, kMinimumAlignment); + TORCH_CHECK( + p.num_heads <= 1 || p.lse_strideH % 8 == 0, + "LSE is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_batches <= 1 || p.lse_strideB % 8 == 0, + "LSE is not correctly aligned (strideB)"); + TORCH_CHECK( + p.num_heads <= 1 || p.q_strideH % kMinimumAlignment == 0, + "query is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_heads <= 1 || p.k_strideH % kMinimumAlignment == 0, + "key is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_heads <= 1 || p.v_strideH % kMinimumAlignment == 0, + "value is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_batches <= 1 || p.q_strideB % kMinimumAlignment == 0, + "query is not correctly aligned (strideB)"); + TORCH_CHECK( + p.num_batches <= 1 || p.k_strideB % kMinimumAlignment == 0, + "key is not correctly aligned (strideB)"); + TORCH_CHECK( + p.num_batches <= 1 || p.v_strideB % kMinimumAlignment == 0, + "value is not correctly aligned (strideB)"); + TORCH_CHECK( + p.q_strideM % kMinimumAlignment == 0, + "query is not correctly aligned (strideM)"); + TORCH_CHECK( + p.k_strideM % kMinimumAlignment == 0, + "key is not correctly aligned (strideM)"); + TORCH_CHECK( + p.v_strideM % kMinimumAlignment == 0, + "value is not correctly aligned (strideM)"); + if (p.bias_ptr) { + TORCH_CHECK( + p.num_batches <= 1 || p.bias_strideB % kMinimumAlignment == 0, + "attn_bias is not correctly aligned (strideB). ", + "attn_bias.stride(0) = ", p.bias_strideB, ", and should be a " + "multiple of ", kMinimumAlignment, "."); + TORCH_CHECK( + p.num_heads <= 1 || p.bias_strideH % kMinimumAlignment == 0, + "attn_bias is not correctly aligned (strideH) ." + "attn_bias.stride(1) = ", p.bias_strideH, ", and should be a " + "multiple of ", kMinimumAlignment, "."); + TORCH_CHECK( + p.num_queries <= 1 || p.bias_strideM % kMinimumAlignment == 0, + "attn_bias is not correctly aligned (strideM). " + "attn_bias.stride(2) = ", p.bias_strideM, ", and should be a ", + "multiple of ", kMinimumAlignment, "."); + } + if (p.grad_bias_ptr) { + TORCH_CHECK( + p.num_batches <= 1 || p.gB_strideB % kMinimumAlignment == 0, + "attn_bias.grad is not correctly aligned (strideB)"); + TORCH_CHECK( + p.num_heads <= 1 || p.gB_strideH % kMinimumAlignment == 0, + "attn_bias.grad is not correctly aligned (strideH)"); + TORCH_CHECK( + p.gB_strideM % kMinimumAlignment == 0, + "attn_bias.grad is not correctly aligned (strideM)"); + } + TORCH_CHECK( + !(p.cu_seqlens_q_ptr && p.bias_ptr), + "CuSeqlen + bias not implemented yet"); + TORCH_CHECK( + p.custom_mask_type < NumCustomMaskTypes, + "Invalid value for `custom_mask_type`"); + TORCH_CHECK( + p.dropout_prob <= 1.0f && p.dropout_prob >= 0.0f, + "Invalid value for `dropout_prob`"); + TORCH_CHECK( + kApplyDropout || p.dropout_prob == 0.0f, + "Set `kApplyDropout`=True to support `dropout_prob > 0`"); + TORCH_CHECK(p.head_dim > 0, "Invalid value for `head_dim`"); + TORCH_CHECK(p.head_dim_value > 0, "Invalid value for `head_dim_value`"); + TORCH_CHECK(p.num_queries > 0, "Invalid value for `num_queries`"); + TORCH_CHECK(p.num_keys > 0, "Invalid value for `num_keys`"); + TORCH_CHECK(p.num_heads > 0, "Invalid value for `num_heads`"); + TORCH_CHECK(p.num_batches > 0, "Invalid value for `num_batches`"); + TORCH_CHECK(p.head_dim <= kMaxK, "kMaxK: Expected `head_dim < kMaxK`"); + TORCH_CHECK( + p.head_dim_value <= kMaxK, "kMaxK: Expected `head_dim_value < kMaxK`"); + if (kKeysQueriesAlignedToBlockSize) { + TORCH_CHECK( + p.cu_seqlens_k_ptr == nullptr, + "This kernel does not support cu_seqlen"); + TORCH_CHECK( + p.cu_seqlens_q_ptr == nullptr, + "This kernel does not support cu_seqlen"); + TORCH_CHECK( + p.num_queries % kBlockSizeI == 0, + "kKeysQueriesAlignedToBlockSize condition not respected"); + TORCH_CHECK( + p.num_keys % kBlockSizeJ == 0, + "kKeysQueriesAlignedToBlockSize condition not respected"); + } + TORCH_CHECK( + kEnableSplitKeys || p.num_splits_key == 1, "SplitKeys is disabled"); + TORCH_CHECK( + p.num_splits_key > 0, "Invalid `num_splits_key` (expected >0)"); + TORCH_CHECK( + p.num_splits_key <= cutlass::ceil_div(p.num_keys, kBlockSizeJ), + "Invalid `num_splits_key` (", + p.num_splits_key, + ") - too large for `num_keys` = ", + p.num_keys); + if (p.window_size != 0) { + TORCH_CHECK( + p.custom_mask_type != NoCustomMask, + "LocalAttention only supported in causal mode"); + } + return true; + } + + static CUTLASS_DEVICE void attention_kernel(Params p) { + extern __shared__ char smem_buffer[]; + SharedStorage& shared_storage = *((SharedStorage*)smem_buffer); + + uint16_t thread_id = threadIdx.x; + uint8_t warp_id = warp_uniform(thread_id / 32); + uint8_t lane_id = thread_id % 32; + + int64_t key_start = p.split_key_device() * kBlockSizeJ; + if (key_start >= p.num_keys) { + return; + } + if (kPrologueQK) { + int64_t query_start = getQueryStart(p, key_start); + prologueQkNextIteration( + shared_storage, p, query_start, key_start, warp_id, lane_id); + } + + // Computes (dO*out).sum(-1) and writes it to `p.delta_ptr` + if (kKernelComputesDelta) { + constexpr int kOptimalElements = + 128 / cutlass::sizeof_bits::value; + if (p.head_dim_value % kOptimalElements == 0) { + for (int query_start = 0; query_start < p.num_queries; + query_start += kBlockSizeI) { + computeDelta(p, query_start, warp_id, lane_id); + } + } else { + for (int query_start = 0; query_start < p.num_queries; + query_start += kBlockSizeI) { + computeDelta<1>(p, query_start, warp_id, lane_id); + } + } + __syncthreads(); + } + + OutputFragments output_frags; + + curandStatePhilox4_32_10_t rng_state_init; + + if (kApplyDropout) { + // See Note [Seed and Offset Device] + auto const [seed, offset] = at::cuda::philox::unpack(p.rng_engine_inputs); + // each element of the attention matrix P with shape + // (batch_sz, n_heads, n_queries, n_keys) is associated with a single + // offset in RNG sequence. we initialize the RNG state with offset that + // starts at the beginning of a (n_queries, n_keys) matrix for this + // block's batch_id and head_id + // initializing rng state is very expensive, so we run once per kernel, + // rather than once per iteration. each iteration takes a copy of the + // initialized RNG state and offsets it as needed. + curand_init( + seed, + 0, + offset + p.dropout_batch_head_rng_offset, + &rng_state_init); + } + + CUTLASS_PRAGMA_UNROLL + for (; key_start < p.num_keys; + key_start += p.num_splits_key_device() * kBlockSizeJ) { + output_frags.clear(); + + int64_t next_key = key_start; + int64_t query_start = getQueryStart(p, key_start); + while (next_key == key_start && query_start < p.num_queries) { + // This line here + // vvvvvvvvvvvvvv + warp_id = warp_uniform(warp_id); + // ^^^^^^^^^^^^^^ + // ... makes everything use less RF and be 10% faster. Why? + // I don't know. My theory is that it forces `nvcc` to + // re-compute indices, offsets etc... and not keep them + // from the previous iteration, which prevents MASSIVE + // register spilling. + + processBlockIJ( + shared_storage, + output_frags, + p, + query_start, + key_start, + rng_state_init, + warp_id, + lane_id); + + int64_t next_query; + incrIteration(p, query_start, key_start, next_query, next_key); + query_start = next_query; + } + if (kOutputInRF) { + writeFragsToGmem( + shared_storage, output_frags, p, key_start, warp_id, lane_id); + } else if (getQueryStart(p, key_start) >= p.num_queries) { + zfillGradKV( + p, key_start, warp_id, lane_id); + } + __syncthreads(); + } + } + + template + static CUTLASS_DEVICE void zfillGradKV( + Params const& p, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) { + constexpr int kThreadsPerKey = 8; + constexpr int kParallelKeys = kNumThreads / kThreadsPerKey; + static_assert(kBlockSizeJ % kParallelKeys == 0, ""); + // This function is not really optimized, but should rarely be used + // It's only used when some keys are "useless" and don't attend to + // any query, due to causal masking + + int thread_id = 32 * warp_id + lane_id; + int k_shift = lane_id % kThreadsPerKey; + + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kBlockSizeJ; j += kParallelKeys) { + int key = key_start + j + (thread_id / kThreadsPerKey); + if (!skipBoundsChecks && key >= p.num_keys) { + continue; + } + auto gv_ptr = p.grad_value_ptr + key * p.gV_strideM(); + auto gk_ptr = p.grad_key_ptr + key * p.gK_strideM(); + + for (int k = k_shift; k < p.head_dim_value; k += kThreadsPerKey) { + gv_ptr[k] = scalar_t(0); + } + for (int k = k_shift; k < p.head_dim; k += kThreadsPerKey) { + gk_ptr[k] = scalar_t(0); + } + } + } + + template + static CUTLASS_DEVICE void processBlockIJ( + SharedStorage& shared_storage, + OutputFragments& output_frags, + Params& p, + int64_t query_start, + int64_t key_start, + const curandStatePhilox4_32_10_t& curand_state_init, + uint8_t warp_id, + uint8_t lane_id) { + cutlass::Array + dropout_keep_mask_doivj; + dropout_keep_mask_doivj.fill(cutlass::uint1b_t{1}); + const float dropout_scale = + kApplyDropout ? 1.0 / (1.0 - p.dropout_prob) : 1.0f; + + cutlass::MatrixCoord no_offset{0, 0}; + accum_t scale = p.scale; + int16_t thread_id = 32 * warp_id + lane_id; + + auto rematerializeThreadIds = [&]() { + // Prevents `nvcc` from keeping values deduced from + // `thread_id`, `warp_id`, ... in RF - to reduce register pressure + warp_id = warp_uniform(thread_id / 32); + lane_id = thread_id % 32; + thread_id = 32 * warp_id + lane_id; + }; + + bool isFirstQuery = (query_start == getQueryStart(p, key_start)); + int64_t next_query, next_key; + incrIteration(p, query_start, key_start, next_query, next_key); + bool isLastQuery = next_key != key_start; + + accum_t di_rf = accum_t(0); + if (thread_id < kBlockSizeI) { + if (query_start + thread_id < p.num_queries) { + di_rf = p.delta_ptr[query_start + thread_id]; + } + shared_storage.di()[thread_id] = di_rf; + } + + int32_t num_queries_in_block = skipBoundsChecks + ? MatmulQK::Mma::Shape::kN + : warp_uniform(cutlass::fast_min( + MatmulQK::Mma::Shape::kN, (int32_t)(p.num_queries - query_start))); + int32_t num_keys_in_block = skipBoundsChecks + ? MatmulQK::Mma::Shape::kM + : warp_uniform(cutlass::fast_min( + MatmulQK::Mma::Shape::kM, (int32_t)(p.num_keys - key_start))); + + auto prologueGradV = [&](int64_t col) { + typename MatmulGradV::Mma::IteratorB iterator_dO( + {int32_t(p.gO_strideM)}, + const_cast(p.grad_output_ptr + query_start * p.gO_strideM + col), + {num_queries_in_block, (int32_t)(p.head_dim_value - col)}, + thread_id, + no_offset); + MatmulGradV::Mma::prologue( + shared_storage.mm_gradV(), + iterator_dO, + thread_id, + num_queries_in_block); + }; + auto prologueGradQ = [&](int col) { + typename MatmulGradQ::Mma::IteratorB iterator_K( + {int32_t(p.k_strideM)}, + const_cast(p.key_ptr + key_start * p.k_strideM + col), + {num_keys_in_block, p.head_dim - col}, + thread_id, + no_offset); + MatmulGradQ::Mma::prologue( + shared_storage.mm_gradQ(), iterator_K, thread_id, num_keys_in_block); + }; + auto prologueGradK = [&](int col) { + typename MatmulGradK::Mma::IteratorB iterator_Q( + {int32_t(p.q_strideM)}, + const_cast(p.query_ptr + query_start * p.q_strideM + col), + {num_queries_in_block, p.head_dim - col}, + thread_id, + no_offset); + MatmulGradK::Mma::prologue( + shared_storage.mm_gradK(), + iterator_Q, + thread_id, + num_queries_in_block); + }; + auto prologueDOV = [&]() { + typename MatmulDOIVJ::Mma::IteratorA iterator_A( + {int32_t(p.gO_strideM)}, + const_cast(p.grad_output_ptr + query_start * p.gO_strideM), + {num_queries_in_block, p.head_dim_value}, + thread_id, + no_offset); + typename MatmulDOIVJ::Mma::IteratorB iterator_B( + {int32_t(p.v_strideM)}, + const_cast(p.value_ptr + key_start * p.v_strideM), + {p.head_dim_value, num_keys_in_block}, + thread_id, + no_offset); + MatmulDOIVJ::Mma::prologue( + shared_storage.mm_doivj(), + iterator_A, + iterator_B, + thread_id, + p.head_dim_value); + }; + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // MatmulQK + ///////////////////////////////////////////////////////////////////////////////////////////////// + { + using Mma = typename MatmulQK::Mma; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, + num_queries_in_block, + p.head_dim // k + ); + + // k_j + typename Mma::IteratorA iterator_A( + {int32_t(p.k_strideM)}, + const_cast(p.key_ptr + key_start * p.k_strideM), + {problem_size.m(), problem_size.k()}, + thread_id, + no_offset); + + // q_i.transpose(-2, -1) + typename Mma::IteratorB iterator_B( + {int32_t(p.q_strideM)}, + const_cast(p.query_ptr + query_start * p.q_strideM), + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + Mma mma( + shared_storage.mm_qk_k(), + shared_storage.mm_qk_q(), + thread_id, + warp_id, + lane_id); + + typename Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma.set_prologue_done(kPrologueQK); + mma.set_zero_outside_bounds(!skipBoundsChecks); + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + accum = cutlass::multiplies()(scale, accum); + + // Epilogue: add LSE + exp and store that to our shared memory buffer + // shmem <- (matmul_result - + // logsumexp[i_start:i_end].unsqueeze(1)).exp() + int warp_idx_mn_0 = + warp_id % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % Mma::Base::WarpCount::kM, + warp_idx_mn_0 / Mma::Base::WarpCount::kM}; + + // apply bias if applicable + if (p.bias_ptr != nullptr) { + // load bias tile Bij into shared memory + typename MatmulQK::BiasLoader::GmemTileIterator bias_iter( + {cutlass::layout::RowMajor(p.bias_strideM)}, + const_cast(p.bias_ptr + query_start * p.bias_strideM + key_start), + {num_queries_in_block, num_keys_in_block}, + thread_id); + cutlass::TensorRef bias_tensor_ref( + shared_storage.bias().data(), + cutlass::layout::RowMajor(MatmulQK::ThreadblockShape::kM)); + typename MatmulQK::BiasLoader::SmemTileIterator smem_tile_iter( + bias_tensor_ref, thread_id); + MatmulQK::BiasLoader::load(bias_iter, smem_tile_iter); + + // Pij += Bij, where Pij is in register fragment and Bij is in shmem + auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + MatmulQK::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_n) {}, + [&](int accum_m, int accum_n, int idx) { + // remember we are transposed + accum[idx] += bias_tensor_ref.at({accum_n, accum_m}); + }, + [&](int accum_n) {}); + } + + // Apply mask + if (p.custom_mask_type == CausalFromTopLeft || + p.custom_mask_type == CausalFromBottomRight) { + auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + int shift = query_start - key_start; + if (p.custom_mask_type == CausalFromBottomRight) { + shift += p.num_keys - p.num_queries; + } + // current_key = key_start + accum_m + // current_query = query_start + accum_n + // mask if: `current_key > current_query` + MatmulQK::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (accum_m > accum_n + shift) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + } + if (p.window_size > 0) { + auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + int shift = query_start - key_start - p.window_size; + // current_key = key_start + accum_m + // current_query = query_start + accum_n + // mask if: `current_key < current_query - window_size` + // if accum_m < accum_n + query_start - window_size - key_start + + MatmulQK::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (accum_m <= accum_n + shift) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + } + __syncthreads(); + if (kPrologueGV) { + prologueGradV(0); + } + if (kPrologueDOV) { + prologueDOV(); + } + + MatmulQK::B2bGemm::accumApplyLSEToSmem( + shared_storage.attn_shared_storage(), + accum, + p.logsumexp_ptr + query_start, + problem_size.n(), + thread_id, + warp_id, + lane_id, + output_tile_coords); +#if 0 + auto accum_ref_attnT = shared_storage.attn_shared_storage().accum_ref(); + PRINT_TENSOR4x4_T0_L0("attn_T", accum_ref_attnT); +#endif + + // if we are using dropout, compute Zij, writing it to shared memory. + // each element of Zij is: + // - 0 with probability dropout_p + // - 1 / (1 - dropout_p) with probability 1 - dropout_p + if (kApplyDropout) { + auto zij = shared_storage.zij().accum_ref(); + // each thread generates a contiguous sequence of elements in Zij, all + // in the same row. the reason they have to come from the same row is + // that sampling random numbers from a contiguous random number sequence + // is much more efficient than jumping around, and the linear offset of + // each element of Z (the global matrix) maps to an offset in a random + // number sequence. for Z, the end of a row and the beginning of the + // next have adjacent offsets, but for Zij (tile of global matrix), this + // is not necessarily the case. + // We must fill the entire `zij` shmem with values (even out of bounds + // on the K-dimension) otherwise we can get NaNs during the GEMM + const int kQueriesPerBlock = kBlockSizeI; + const int threads_per_row = cutlass::fast_min( + kNumThreads / kQueriesPerBlock, (int64_t)num_keys_in_block); + const int elts_per_thread = cutlass::round_nearest( + cutlass::ceil_div(num_keys_in_block, threads_per_row), 4); + + const int thread_i = thread_id / threads_per_row; + const int thread_start_j = + (thread_id % threads_per_row) * elts_per_thread; + + if (thread_i < kQueriesPerBlock && thread_start_j < num_keys_in_block) { + curandStatePhilox4_32_10_t curand_state = curand_state_init; + skipahead( + (query_start + thread_i) * p.num_keys + + (key_start + thread_start_j), + &curand_state); + + // generate elements of Zij, 4 elements at a time + for (int zij_start_col_idx = thread_start_j; zij_start_col_idx < + cutlass::fast_min(thread_start_j + elts_per_thread, + num_keys_in_block); + zij_start_col_idx += 4) { + const float4 rand_uniform_quad = curand_uniform4(&curand_state); + + CUTLASS_PRAGMA_UNROLL + for (int quad_idx = 0; quad_idx < 4; ++quad_idx) { + // we'll write Zij transposed since attention is also transposed + // during the matmul to compute dV. + zij.at({zij_start_col_idx + quad_idx /*k*/, thread_i /*q*/}) = + (&rand_uniform_quad.x)[quad_idx] > p.dropout_prob + ? scalar_t(dropout_scale) + : scalar_t(0); + } + } + } + __syncthreads(); +#if 0 + PRINT_TENSOR4x4_T0_L0("zij", zij); + PRINT_TENSOR4x4_T0_L0_START("zij", zij, kBlockSizeJ - 4, kBlockSizeI - 4); +#endif + + // Save mask for later DOIVJ matmul + + int warp_idx_mn_0 = warp_id % + (MatmulDOIVJ::Mma::Base::WarpCount::kM * + MatmulDOIVJ::Mma::Base::WarpCount::kN); + auto output_tile_coords_doivj = cutlass::MatrixCoord{ + warp_idx_mn_0 % MatmulDOIVJ::Mma::Base::WarpCount::kM, + warp_idx_mn_0 / MatmulDOIVJ::Mma::Base::WarpCount::kM}; + auto lane_offset = MatmulDOIVJ::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords_doivj); + MatmulDOIVJ::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m /*q*/, int accum_n /*k*/, int idx) { + if (zij.at({accum_n, accum_m}) == scalar_t(0)) { + dropout_keep_mask_doivj[idx] = cutlass::uint1b_t{0}; + } + }, + [&](int accum_m) {}); + } + __syncthreads(); + } + rematerializeThreadIds(); + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradV matmul + // + // grad_v[j_start:j_end] += attn_T @ do_i + ///////////////////////////////////////////////////////////////////////////////////////////////// + constexpr bool kSingleIterationGradV = + kMaxK <= MatmulGradV::ThreadblockShape::kN; + for (int32_t col = 0; col < (kSingleIterationGradV ? 1 : p.head_dim_value); + col += MatmulGradV::ThreadblockShape::kN) { + using Mma = typename MatmulGradV::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, p.head_dim_value - col, num_queries_in_block); + auto createEpilogueIter = [&]() { + return typename MatmulGradV::OutputTileIterator( + typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()}, + p.grad_value_ptr + key_start * p.gV_strideM() + col, + {num_keys_in_block, p.head_dim_value - col}, + thread_id); + }; + typename Mma::IteratorB iterator_B( + {int32_t(p.gO_strideM)}, + const_cast(p.grad_output_ptr + query_start * p.gO_strideM + col), + {num_queries_in_block, p.head_dim_value - col}, + thread_id, + no_offset); + + // if dropout: dVj += (Pij.T * Zij) @ dOi + // otherwise: dVj += Pij.T @ dOi + Mma mma( + // operand A: Pij.T + shared_storage.attn_shared_storage().accum_ref(), + // operand A_scale Zij.T: + // if we're using dropout, operand A is Pij_dropped.T = Pij.T * Zij.T + // which is computed on the fly as fragments of Pij.T are loaded in + shared_storage.zij().accum_ref(), + // operand B: dOi - which was loaded into shared memory previously + // when we computed dVj + shared_storage.mm_gradV().operand_B_ref(), + thread_id, + warp_id, + lane_id); + + int storage_id = col / MatmulGradV::ThreadblockShape::kN; + AccumTileGmem gmem_tile{ + p.workspace_gv + storage_id * AccumTileGmem::kElementsStored}; + if (!kOutputInRF) { + if (isFirstQuery || !kNeedsAccumGradV) { + output_frags.gradV.clear(); + } else { + gmem_tile.load(output_frags.gradV, thread_id); + } + } + mma.set_prologue_done(kPrologueGV); + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + + mma(gemm_k_iterations, + output_frags.gradV, + iterator_B, + output_frags.gradV); + __syncthreads(); + if (kPrologueGV && !kSingleIterationGradV && + col + MatmulGradV::ThreadblockShape::kN < p.head_dim_value) { + prologueGradV(col + MatmulGradV::ThreadblockShape::kN); + } + + if (!kOutputInRF) { + if (kNeedsAccumGradV && !isLastQuery) { + gmem_tile.store(output_frags.gradV, thread_id); + } else { + accumulateInGmem( + shared_storage.gradV_epilogue(), + output_frags.gradV, + createEpilogueIter(), + isFirstQuery || kNeedsAccumGradV, + warp_id, + lane_id); + } + } + } + __syncthreads(); + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // MatmulDOIVJ + ///////////////////////////////////////////////////////////////////////////////////////////////// + { + using Mma = typename MatmulDOIVJ::Mma; + // do_i + typename Mma::IteratorA iterator_A( + {int32_t(p.gO_strideM)}, + const_cast(p.grad_output_ptr + query_start * p.gO_strideM), + {num_queries_in_block, p.head_dim_value}, + thread_id, + no_offset); + + // v_j.transpose(-2, -1) + typename Mma::IteratorB iterator_B( + {int32_t(p.v_strideM)}, + const_cast(p.value_ptr + key_start * p.v_strideM), + {p.head_dim_value, num_keys_in_block}, + thread_id, + no_offset); + + Mma mma(shared_storage.mm_doivj(), thread_id, warp_id, lane_id); + mma.set_prologue_done(kPrologueDOV); + mma.set_zero_outside_bounds(!skipBoundsChecks); + + typename Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (p.head_dim_value + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + __syncthreads(); + if (kPrologueGQ) { + prologueGradQ(0); + } + if (kPrologueGK) { + prologueGradK(0); + } + + int warp_idx_mn_0 = + warp_id % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % Mma::Base::WarpCount::kM, + warp_idx_mn_0 / Mma::Base::WarpCount::kM}; + // TODO: This must be terribly inefficient. There must be a better way + // tmp [RF] <- (accum [RF] - Di [smem] ) * attn_T.T [smem] + // attn_shared_storage [smem] <- tmp.T + // tmp_shared_storage [smem] <- tmp + { + using LambdaIterator = typename MatmulDOIVJ::AccumLambdaIterator; + auto lane_offset = LambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + // if dropout was used, compute dPij = dPij_dropped * Zij + if (kApplyDropout) { + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (dropout_keep_mask_doivj[idx].get()) { + accum[idx] *= dropout_scale; + } else { + accum[idx] = 0; + } + }, + [&](int accum_m) {}); + } + + auto attn_T = shared_storage.attn_shared_storage().accum_ref(); +#if 0 + PRINT_B0_T0("doivj_dropped"); + print_warp_accum(accum, lane_offset, 4, 4); + PRINT_TENSOR4x4_T0_L0("attn_T", attn_T) +#endif + accum_t current_di; + // dSij = (dPij - Di) * Pij + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { current_di = shared_storage.di()[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + // TODO: Otherwise we can get nans as we + // might have infs here (only seen on f16 tho) + if (skipBoundsChecks || + (accum_m < num_queries_in_block && + accum_n < num_keys_in_block)) { + accum_t attn = attn_T.at({accum_n, accum_m}); + accum[idx] = (accum[idx] - current_di) * attn; + } else { + accum[idx] = 0; + } + }, + [&](int accum_m) { + + }); + + // store bias gradient tile dBij to global memory, + // where dBij = dSij = Pij * (dPij - Di) + if (p.grad_bias_ptr != nullptr) { + typename MatmulDOIVJ::BiasGradEpilogue::OutputTileIterator + output_iter( + typename MatmulDOIVJ::BiasGradEpilogue::OutputTileIterator:: + Params{p.gB_strideM}, + // grad_bias_ptr is offset to point at beginning of + // matrix of shape (queries, keys) for a given + // (batch_id, head_id) the pointer arithmetic here produces + // a pointer to the start of the current tile within that + // matrix + p.grad_bias_ptr + query_start * p.gB_strideM + key_start, + {num_queries_in_block, num_keys_in_block}, + thread_id); + + // no-op epilogue operator - just casting and storing contents of + // accum to global memory + typename MatmulDOIVJ::BiasGradEpilogue::OutputOp output_op({1, 1}); + typename MatmulDOIVJ::BiasGradEpilogue epilogue( + shared_storage.gradB_epilogue(), thread_id, warp_id, lane_id); + epilogue(output_op, output_iter, accum, output_iter); + } + + accum = accum * scale; + +#if 0 + PRINT_B0_T0("(doivj - di) * attn * scale"); + print_warp_accum(accum, lane_offset, 4, 4); +#endif + + __syncthreads(); + if (!MatmulGradK::DefaultMmaFromSmem::kIsTransposedA) { + auto tmpT = shared_storage.tmpT_shared_storage().accum_ref(); + // attn <- attn_T.T + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + tmpT.at({accum_n, accum_m}) = scalar_t(accum[idx]); + }, + [&](int accum_m) {}); + } + } + + MatmulDOIVJ::B2bGemm::accumToSmem( + shared_storage.tmp_shared_storage(), + accum, + lane_id, + output_tile_coords); + __syncthreads(); + } + // Force `nvcc` to recompute values that depend on the variables just below + // to use less RF and prevent some spilling + p.head_dim = warp_uniform(p.head_dim); + p.k_strideM = warp_uniform(p.k_strideM); + rematerializeThreadIds(); + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradQ matmul + // + // grad_q[i_start:i_end] += tmp @ k_j + ///////////////////////////////////////////////////////////////////////////////////////////////// + // Skip the loop & associated branches if we know at compile time the number + // of iterations + constexpr bool kSingleIterationGradQ = + kMaxK <= MatmulGradQ::ThreadblockShape::kN; + for (int col = 0; col < (kSingleIterationGradQ ? 1 : p.head_dim); + col += MatmulGradQ::ThreadblockShape::kN) { + using Mma = typename MatmulGradQ::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_queries_in_block, + false ? MatmulGradQ::ThreadblockShape::kN : p.head_dim - col, + num_keys_in_block); + + // k_j + typename Mma::IteratorB iterator_B( + {int32_t(p.k_strideM)}, + const_cast(p.key_ptr + key_start * p.k_strideM + col), + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + auto a = shared_storage.tmp_shared_storage().accum_ref(); + Mma mma( + // operand A: dSij + shared_storage.tmp_shared_storage().accum_ref(), + // operand B: Kj + shared_storage.mm_gradQ().operand_B_ref(), + thread_id, + warp_id, + lane_id); + + typename Mma::FragmentC accum; + + int col_id = col / MatmulGradQ::ThreadblockShape::kN; + int num_cols = kSingleIterationGradQ + ? 1 + : ceil_div(p.head_dim, MatmulGradQ::ThreadblockShape::kN); + int storage_id = (col_id + query_start / kBlockSizeI * num_cols); + + if (p.num_splits_key_device() > 1) { + AtomicLock::acquire( + &p.workspace_gq[storage_id].lock, + p.split_key_device() + 1, + thread_id); + // Make sure we can see other block's output + __threadfence(); + } + + AccumTileGmem gmem_tile{&p.workspace_gq[storage_id].buffer[0]}; + if (!kNeedsAccumGradQ || + (p.num_splits_key_device() == 1 && key_start == 0)) { + // if we know we are the first to access it, we know it's only zeros. + // Avoids a load from gmem (and gmem init as well) + accum.clear(); + } else { + gmem_tile.load(accum, thread_id); + } + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + mma.set_prologue_done(kPrologueGQ); + mma(gemm_k_iterations, accum, iterator_B, accum); + __syncthreads(); + bool isLastColumn = kSingleIterationGradQ || + (col + MatmulGradQ::ThreadblockShape::kN >= p.head_dim); + if (kPrologueGQ && !isLastColumn) { + prologueGradQ(col + MatmulGradQ::ThreadblockShape::kN); + } + + bool isLast = [&]() { + int32_t next_key = key_start + p.num_splits_key_device() * kBlockSizeJ; + if (p.num_keys <= next_key) { + return true; + } + if (query_start < getSmallestQueryForKey(p, next_key)) { + return true; + } + return false; + }(); + // Output results + if (p.num_splits_key_device() > 1) { + int32_t numAddsSoFar = -1; + if (isLast && thread_id == 0) { + numAddsSoFar = atomicAdd(&p.workspace_gq[storage_id].counter, 1) + + 1; // `atomicAdd` returns the old value + } + isLast = __syncthreads_or( + numAddsSoFar == getNumParallelBlocksForQuery(p, query_start)); + assert(numAddsSoFar <= getNumParallelBlocksForQuery(p, query_start)); + } + if (kNeedsAccumGradQ && !isLast) { + gmem_tile.store(accum, thread_id); + if (p.num_splits_key_device() > 1) { + // Make sure everyone wrote before we release the lock + __threadfence(); + __syncthreads(); + AtomicLock::release(&p.workspace_gq[storage_id].lock, thread_id); + } + } else { + // NOTE: We're not releasing the lock because no one is expected + // to come after us (we're the last one to write) + typename MatmulGradQ::OutputTileIterator output_it( + typename MatmulGradQ::OutputTileIterator::Params{p.gQ_strideM()}, + p.grad_query_ptr + query_start * p.gQ_strideM() + col, + {problem_size.m(), problem_size.n()}, + thread_id); + // if `direct_store` is True, we store to gmem (`*gmem = accum`) + // otherwise, we accumulate in gmem (`*gmem = *gmem + accum`) + // If we know ahead of time when we will write for the first time + // we can: + // (1) Avoid an additional memory read + // (2) Avoid the cost of initializing memory to 0 + bool direct_store = kNeedsAccumGradQ || key_start == 0 || + (p.num_splits_key_device() > 1); + accumulateInGmem( + isLastColumn ? shared_storage.gradQ_epilogue_lastIter() + : shared_storage.gradQ_epilogue(), + accum, + output_it, + direct_store, + warp_id, + lane_id); + } + } + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradK matmul + // + // grad_k[i_start:i_end] += tmp.transpose(-2, -1) @ q_i + ///////////////////////////////////////////////////////////////////////////////////////////////// + rematerializeThreadIds(); + + constexpr bool kSingleIterationGradK = + kMaxK <= MatmulGradK::ThreadblockShape::kN; + for (int col = 0; col < (kSingleIterationGradK ? 1 : p.head_dim); + col += MatmulGradK::ThreadblockShape::kN) { + using Mma = typename MatmulGradK::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, + false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col, + num_queries_in_block); + auto createEpilogueIter = [&]() { + return typename MatmulGradK::OutputTileIterator( + typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()}, + p.grad_key_ptr + key_start * p.gK_strideM() + col, + {num_keys_in_block, + false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col}, + thread_id); + }; + + // q_i + typename Mma::IteratorB iterator_B( + {int32_t(p.q_strideM)}, + const_cast(p.query_ptr + query_start * p.q_strideM + col), + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + auto getTmp = [&](int) { return &shared_storage.tmp_shared_storage(); }; + auto getTmpT = [&](int) { return &shared_storage.tmpT_shared_storage(); }; + // this is basically: + // opA = kIsTransposedA ? getTmp() : getTmpT(); + bool constexpr kIsTransposedA = + MatmulGradK::DefaultMmaFromSmem::kIsTransposedA; + auto& opA = *call_conditional< + kIsTransposedA, + decltype(getTmp), + decltype(getTmpT)>::apply(getTmp, getTmpT, 0); + Mma mma( + // operand A: dSij.T + opA.accum_ref(), + // operand B: Qi + shared_storage.mm_gradK().operand_B_ref(), + thread_id, + warp_id, + lane_id); + + int storage_id = col / MatmulGradK::ThreadblockShape::kN; + AccumTileGmem gmem_tile{ + p.workspace + storage_id * AccumTileGmem::kElementsStored}; + if (!kOutputInRF) { + if (isFirstQuery || !kNeedsAccumGradK) { + output_frags.gradK.clear(); + } else { + gmem_tile.load(output_frags.gradK, thread_id); + } + } + mma.set_prologue_done(kPrologueGK); + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + + mma(gemm_k_iterations, + output_frags.gradK, + iterator_B, + output_frags.gradK); + __syncthreads(); + bool isLastColumn = kSingleIterationGradK || + col + MatmulGradK::ThreadblockShape::kN >= p.head_dim; + if (kPrologueGK && !isLastColumn) { + prologueGradK(col + MatmulGradK::ThreadblockShape::kN); + } + + if (kPrologueQK && isLastColumn) { + int64_t next_query, next_key; + incrIteration(p, query_start, key_start, next_query, next_key); + DISPATCH_BOOL( + next_key != key_start, kForceReloadK, ([&]() { + prologueQkNextIteration( + shared_storage, p, next_query, next_key, warp_id, lane_id); + })); + } + + // Output results + if (!kOutputInRF) { + if (kNeedsAccumGradK && !isLastQuery) { + gmem_tile.store(output_frags.gradK, thread_id); + } else { + accumulateInGmem( + isLastColumn ? shared_storage.gradK_epilogue_final() + : shared_storage.gradK_epilogue(), + output_frags.gradK, + createEpilogueIter(), + isFirstQuery || kNeedsAccumGradK, + warp_id, + lane_id); + __syncthreads(); + } + } + } + } + + static CUTLASS_HOST_DEVICE int64_t getQueryStartShift(Params const& p) { + if (p.custom_mask_type == NoCustomMask && p.num_splits_key_device() > 1) { + return (p.split_key_device() * kBlockSizeI) % getQueryEnd(p); + } + return 0; + } + + // Iteration order logic + static CUTLASS_HOST_DEVICE int64_t + getQueryStart(Params const& p, int64_t key_start) { + return getSmallestQueryForKey(p, key_start) + getQueryStartShift(p); + }; + static CUTLASS_HOST_DEVICE int64_t getQueryEnd(Params const& p) { + return align_up(p.num_queries, kBlockSizeI); + }; + + static CUTLASS_HOST_DEVICE int64_t + getSmallestQueryForKey(Params const& p, int64_t key_start) { + if (p.custom_mask_type == NoCustomMask) { + return 0; + } + int64_t shift = p.custom_mask_type == CausalFromBottomRight + ? p.num_keys - p.num_queries + : 0; + int64_t window_size = + p.window_size == 0 ? p.num_queries + p.num_keys : p.window_size; + + auto last_key_for_block = + cutlass::fast_min(key_start + kBlockSizeJ, (int64_t)p.num_keys) - 1; + int first_query = key_start - shift; + int last_query = last_key_for_block - shift + window_size - 1; + if (last_query < 0 || first_query >= p.num_queries) { + return getQueryEnd(p); // nothing to compute in this column + } + first_query = cutlass::fast_max(0, first_query); + return (first_query / kBlockSizeI) * kBlockSizeI; + }; + + // Returns how many kernel blocks will write to a given block in `grad_query` + // This is usually equal to the number of key splits, but can be different + // for instance in the causal case, or varying seqlen + static CUTLASS_HOST_DEVICE int32_t + getNumParallelBlocksForQuery(Params const& p, int32_t query_start) { + int16_t num_key_blocks = ceil_div(p.num_keys, kBlockSizeJ); + if (p.custom_mask_type != NoCustomMask) { + int32_t shift = p.custom_mask_type == CausalFromBottomRight + ? p.num_keys - p.num_queries + : 0; + int32_t last_query_for_block = + cutlass::fast_min(query_start + kBlockSizeI, p.num_queries) - 1; + int32_t last_key_for_block = + cutlass::fast_min(last_query_for_block + shift, p.num_keys - 1); + int32_t first_key_for_block = p.window_size == 0 + ? 0 + : cutlass::fast_max(query_start - p.window_size + 1 + shift, 0); + + if (p.window_size == 0) { + num_key_blocks = last_key_for_block / kBlockSizeJ + 1; + } else { + num_key_blocks = (last_key_for_block / kBlockSizeJ) - + (first_key_for_block / kBlockSizeJ) + 1; + } + + if (last_key_for_block < 0 || first_key_for_block >= p.num_keys) { + num_key_blocks = 0; + } + } + return cutlass::fast_min(p.num_splits_key_device(), num_key_blocks); + }; + + // Returns the next block to process + static CUTLASS_HOST_DEVICE void incrIteration( + Params const& p, + int64_t query_start, + int64_t key_start, + int64_t& next_query, + int64_t& next_key) { + next_query = query_start + kBlockSizeI; + next_key = key_start; + auto query_shift = getQueryStartShift(p); + // Wrap around + if (query_shift) { + if (next_query >= p.num_queries) { + next_query = getSmallestQueryForKey(p, key_start); + return; + } else if (query_start < query_shift && query_shift <= next_query) { + // jump to next key + } else { + return; + } + } else { + if (p.window_size > 0) { + int32_t shift = p.custom_mask_type == CausalFromBottomRight + ? p.num_keys - p.num_queries + : 0; + // last key that is not masked out + int last_key_for_block = + cutlass::fast_min(key_start + kBlockSizeJ, (int64_t)p.num_keys) - 1; + int last_query = last_key_for_block - shift + p.window_size - 1; + if (next_query <= last_query && next_query < p.num_queries) { + return; + } + } else if (next_query < p.num_queries) { + return; + } + // jump to next key + } + // Next key + next_key = key_start + p.num_splits_key_device() * (int64_t)kBlockSizeJ; + next_query = getQueryStart(p, next_key); + } + + template + static CUTLASS_DEVICE void prologueQkNextIteration( + SharedStorage& shared_storage, + Params const& p, + int32_t query_start, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) { + if (query_start >= p.num_queries || key_start >= p.num_keys) { + return; + } + + static constexpr bool kReloadK = + kForceReloadK || !MatmulQK::Mma::kSmemContainsEntireMat; + int thread_id = 32 * warp_id + lane_id; + typename MatmulQK::Mma::IteratorA iterator_A( + {int32_t(p.k_strideM)}, + const_cast(p.key_ptr + key_start * p.k_strideM), + {p.num_keys - key_start, p.head_dim}, + thread_id, + cutlass::MatrixCoord{0, 0}); + + typename MatmulQK::Mma::IteratorB iterator_B( + {int32_t(p.q_strideM)}, + const_cast(p.query_ptr + query_start * p.q_strideM), + {p.head_dim, p.num_queries - query_start}, + thread_id, + cutlass::MatrixCoord{0, 0}); + + MatmulQK::Mma::prologue( + shared_storage.mm_qk_k(), + shared_storage.mm_qk_q(), + iterator_A, + iterator_B, + thread_id, + p.head_dim); + } + + template + static CUTLASS_DEVICE void writeFragsToGmem( + SharedStorage& shared_storage, + OutputFragments& output_frags, + Params const& p, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) { + uint16_t thread_id = 32 * warp_id + lane_id; + int32_t num_keys_in_block = skipBoundsChecks + ? MatmulQK::Mma::Shape::kM + : cutlass::fast_min( + MatmulQK::Mma::Shape::kM, p.num_keys - key_start); + typename MatmulGradV::OutputTileIterator outputV_it( + typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()}, + p.grad_value_ptr + key_start * p.gV_strideM(), + {num_keys_in_block, p.head_dim_value}, + thread_id); + accumulateInGmem( + shared_storage.gradV_epilogue_final(), + output_frags.gradV, + outputV_it, + true, + warp_id, + lane_id); + + typename MatmulGradK::OutputTileIterator outputK_it( + typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()}, + p.grad_key_ptr + key_start * p.gK_strideM(), + {num_keys_in_block, + false ? MatmulGradK::ThreadblockShape::kN : p.head_dim}, + thread_id); + accumulateInGmem( + shared_storage.gradK_epilogue_final(), + output_frags.gradK, + outputK_it, + true, + warp_id, + lane_id); + } + + template + static CUTLASS_DEVICE void accumulateInGmem( + typename MatmulT::DefaultEpilogue::SharedStorage& epilogue_smem, + typename MatmulT::Mma::FragmentC const& accum, + typename MatmulT::OutputTileIterator output_it, + bool first, + uint8_t warp_id, + uint8_t lane_id) { + using DefaultEpilogue = typename MatmulT::DefaultEpilogue; + using DefaultOutputOp = typename MatmulT::DefaultOutputOp; + using Mma = typename MatmulT::Mma; + int thread_id = 32 * warp_id + lane_id; + DISPATCH_BOOL( + first, kIsFirst, ([&]() { + static constexpr auto ScaleType = kIsFirst + ? cutlass::epilogue::thread::ScaleType::Nothing + : cutlass::epilogue::thread::ScaleType::NoBetaScaling; + using EpilogueOutputOp = + typename cutlass::epilogue::thread::LinearCombination< + typename DefaultOutputOp::ElementOutput, + DefaultOutputOp::kCount, + typename DefaultOutputOp::ElementAccumulator, + typename DefaultOutputOp::ElementCompute, + ScaleType>; + using Epilogue = + typename cutlass::epilogue::threadblock::EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename MatmulT::OutputTileIterator, + typename DefaultEpilogue::AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true // IterationsUnroll + >; + EpilogueOutputOp rescale({1, 1}); + Epilogue epilogue(epilogue_smem, thread_id, warp_id, lane_id); + epilogue(rescale, output_it, accum, output_it); + })); + } + + template + static CUTLASS_DEVICE void computeDelta( + Params const& p, + int32_t query_start, + uint8_t warp_id, + uint8_t lane_id) { + // Each thread computes one value for Delta + // Depending on warp configuration, we might have multiple + // threads of the same warp working on the same row + using AccessType = cutlass::Array; + static_assert(kNumThreads >= kBlockSizeI, ""); + static constexpr int kNumThreadsPerLine = kNumThreads / kBlockSizeI; + int16_t thread_id = 32 * warp_id + lane_id; + + int16_t laneFirstCol = kElementsPerAccess * (lane_id % kNumThreadsPerLine); + int16_t laneRow = thread_id / kNumThreadsPerLine; + bool rowPred = (query_start + laneRow) < p.num_queries; + bool pred = rowPred; + + // on windows, previous syntax __restrict__ AccessType* + // resulted in error: "restrict" is not allowed + const AccessType* __restrict__ grad_output_ptr = + reinterpret_cast( + p.grad_output_ptr + (query_start + laneRow) * p.gO_strideM + + laneFirstCol); + const AccessType* __restrict__ output_ptr = + reinterpret_cast( + p.output_ptr + (query_start + laneRow) * p.o_strideM() + + laneFirstCol); + + static constexpr int64_t kMaxIters = + kMaxK / (kElementsPerAccess * kNumThreadsPerLine); + constexpr int kPipelineStages = 2; + accum_t delta_value = accum_t(0); + using GlobalLoad = + cutlass::arch::global_load; + AccessType frag_grad_output[kPipelineStages]; + AccessType frag_output[kPipelineStages]; + + auto loadAndIncrement = [&](int ld_pos, bool is_valid) { + frag_grad_output[ld_pos].clear(); + frag_output[ld_pos].clear(); + GlobalLoad(frag_grad_output[ld_pos], grad_output_ptr, is_valid); + GlobalLoad(frag_output[ld_pos], output_ptr, is_valid); + grad_output_ptr += kNumThreadsPerLine; + output_ptr += kNumThreadsPerLine; + }; + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kPipelineStages - 1; ++iter) { + int ld_pos = iter % kPipelineStages; + pred = pred && + (laneFirstCol + iter * kElementsPerAccess * kNumThreadsPerLine) < + p.head_dim_value; + loadAndIncrement(ld_pos, pred); + } + auto columnIteration = [&](int iter) { + // Load for next iter + int ld_pos = (iter + kPipelineStages - 1) % kPipelineStages; + pred = pred && + (laneFirstCol + + (iter + kPipelineStages - 1) * kElementsPerAccess * + kNumThreadsPerLine) < p.head_dim_value; + loadAndIncrement(ld_pos, pred); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < AccessType::kElements; ++i) { + delta_value += accum_t(frag_output[iter % kPipelineStages][i]) * + accum_t(frag_grad_output[iter % kPipelineStages][i]); + } + }; + + // If we have a small lower-bound for K, we can unroll the loop + if (kMaxK <= 256) { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kMaxIters; ++iter) { + columnIteration(iter); + } + } else { + int num_iters = + ceil_div(p.head_dim_value, kElementsPerAccess * kNumThreadsPerLine) * + (kElementsPerAccess * kNumThreadsPerLine); + for (int iter = 0; iter < num_iters; ++iter) { + columnIteration(iter); + } + } + + // Reduce between workers + static_assert( + kNumThreadsPerLine == 1 || kNumThreadsPerLine == 2 || + kNumThreadsPerLine == 4, + ""); + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kNumThreadsPerLine; i *= 2) { + delta_value = delta_value + __shfl_xor_sync(0xffffffff, delta_value, i); + } + + // Store in gmem + if (rowPred) { + p.delta_ptr[query_start + laneRow] = delta_value; + } + } +}; + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_backward_batched_impl(typename AK::Params p) { + if (!p.advance_to_block()) { + return; + } + AK::attention_kernel(p); +} + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_backward_batched(typename AK::Params params); + +} // namespace PyTorchMemEffAttention + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h new file mode 100644 index 0000000000000000000000000000000000000000..3075286eb0ca590303e37e84429ba977e39fd69e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h @@ -0,0 +1,1358 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +using namespace gemm_kernel_utils; + +namespace PyTorchMemEffAttention { +namespace { +template +constexpr int getWarpsPerSmFw() { + return ( + Arch::kMinComputeCapability >= 80 && + !cutlass::platform::is_same::value + ? 16 + : 12); +} +static CUTLASS_DEVICE float atomicMaxFloat(float* addr, float value) { + // source: https://stackoverflow.com/a/51549250 + return !signbit(value) + ? __int_as_float(atomicMax((int *)addr, __float_as_int(value))) + : __uint_as_float( + atomicMin((unsigned int *)addr, __float_as_uint(value))); +} +} // namespace + +template < + // The datatype of Q/K/V + typename scalar_t_, + // Architecture we are targeting (eg `cutlass::arch::Sm80`) + typename ArchTag, + // If Q/K/V are correctly aligned in memory and we can run a fast kernel + bool isAligned_, + int kQueriesPerBlock_, + int kKeysPerBlock_, + // upperbound on `max(value.shape[-1], query.shape[-1])` + int kMaxK_ = (int)cutlass::platform::numeric_limits::max(), + // This is quite slower on V100 for some reason + // Set to false if you know at compile-time you will never need dropout + bool kSupportsDropout_ = true, + bool kSupportsBias_ = true> +struct AttentionKernel { + enum CustomMaskType { + NoCustomMask = 0, + CausalFromTopLeft = 1, + CausalFromBottomRight = 2, + NumCustomMaskTypes, + }; + + using scalar_t = scalar_t_; + using accum_t = float; + using lse_scalar_t = float; + using output_t = scalar_t; + // Accumulator between 2 iterations + // Using `accum_t` improves perf on f16 at the cost of + // numerical errors + using output_accum_t = accum_t; + static constexpr bool kSupportsDropout = kSupportsDropout_; + static constexpr bool kSupportsBias = kSupportsBias_; + static constexpr int kKeysPerBlock = kKeysPerBlock_; + static constexpr int kQueriesPerBlock = kQueriesPerBlock_; + static constexpr int kMaxK = kMaxK_; + static constexpr bool kIsAligned = isAligned_; + static constexpr bool kSingleValueIteration = kMaxK <= kKeysPerBlock; + static constexpr int32_t kAlignLSE = 32; // block size of backward + static constexpr bool kIsHalf = cutlass::sizeof_bits::value == 16; + static constexpr bool kPreloadV = + ArchTag::kMinComputeCapability >= 80 && kIsHalf; + static constexpr bool kKeepOutputInRF = kSingleValueIteration; + static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF && + !cutlass::platform::is_same::value; + + static_assert(kQueriesPerBlock % 32 == 0, ""); + static_assert(kKeysPerBlock % 32 == 0, ""); + static constexpr int kNumWarpsPerBlock = + kQueriesPerBlock * kKeysPerBlock / (32 * 32); + static constexpr int kWarpSize = 32; + + // Launch bounds + static constexpr int kNumThreads = kWarpSize * kNumWarpsPerBlock; + static constexpr int kMinBlocksPerSm = + getWarpsPerSmFw() / kNumWarpsPerBlock; + + struct Params { + // Input tensors + const scalar_t* query_ptr = nullptr; // [num_queries, num_heads, head_dim] + const scalar_t* key_ptr = nullptr; // [num_keys, num_heads, head_dim] + const scalar_t* value_ptr = nullptr; // [num_keys, num_heads, head_dim_value] + const scalar_t* attn_bias_ptr = nullptr; // [num_heads, num_queries, num_keys] + const int32_t* seqstart_q_ptr = nullptr; + const int32_t* seqstart_k_ptr = nullptr; + + const int32_t* seqlen_k_ptr = nullptr; + uint32_t causal_diagonal_offset = 0; + + // Output tensors + output_t* output_ptr = nullptr; // [num_queries, num_heads, head_dim_value] + // [num_queries, num_heads, head_dim_value] + output_accum_t* output_accum_ptr = nullptr; + // [num_heads, num_queries] - can be null + lse_scalar_t* logsumexp_ptr = nullptr; + + // Sliding window. ignored if == 0 + int32_t window_size = 0; + + // Scale + accum_t scale = 0.0; + + // Dimensions/strides + int32_t head_dim = 0; + int32_t head_dim_value = 0; + int32_t num_queries = 0; + int32_t num_keys = 0; + int32_t num_keys_absolute = 0; + + uint8_t custom_mask_type = NoCustomMask; + + int32_t q_strideM = 0; + int32_t k_strideM = 0; + int32_t v_strideM = 0; + int32_t bias_strideM = 0; + + int32_t o_strideM = 0; + + // Everything below is only used in `advance_to_block` + // and shouldn't use registers + int32_t q_strideH = 0; + int32_t k_strideH = 0; + int32_t v_strideH = 0; + int64_t bias_strideH = 0; + + int64_t q_strideB = 0; + int64_t k_strideB = 0; + int64_t v_strideB = 0; + int64_t bias_strideB = 0; + + int32_t num_batches = 0; + int32_t num_heads = 0; + + // dropout + bool use_dropout = false; + unsigned long long dropout_batch_head_rng_offset = 0; + float dropout_prob = 0.0f; + at::PhiloxCudaState rng_engine_inputs = at::PhiloxCudaState(0, 0); + int64_t* extragraph_offset = nullptr; + int64_t* seed = nullptr; + + // Moves pointers to what we should process + // Returns "false" if there is no work to do + CUTLASS_DEVICE bool advance_to_block() { + auto batch_id = blockIdx.z; + auto head_id = blockIdx.y; + auto query_start = blockIdx.x * kQueriesPerBlock; + + auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE; + + if (kSupportsDropout) { + dropout_batch_head_rng_offset = + batch_id * num_heads * num_queries * num_keys + + head_id * num_queries * num_keys; + } + + int64_t q_start = 0, k_start = 0; + // Advance to current batch - in case of different sequence lengths + if (seqstart_q_ptr != nullptr) { + assert(seqstart_k_ptr != nullptr); + seqstart_q_ptr += batch_id; + + q_start = seqstart_q_ptr[0]; + int64_t q_next_start = seqstart_q_ptr[1]; + int64_t k_end; + seqstart_k_ptr += batch_id; + + if (seqlen_k_ptr) { + k_start = seqstart_k_ptr[0]; + k_end = k_start + seqlen_k_ptr[batch_id]; + } else { + k_start = seqstart_k_ptr[0]; + k_end = seqstart_k_ptr[1]; + } + + num_queries = q_next_start - q_start; + num_keys = k_end - k_start; + + if (query_start >= num_queries) { + return false; + } + } else { + query_ptr += batch_id * q_strideB; + key_ptr += batch_id * k_strideB; + value_ptr += batch_id * v_strideB; + output_ptr += int64_t(batch_id * num_queries) * o_strideM; + if (output_accum_ptr != nullptr) { + output_accum_ptr += + int64_t(batch_id * num_queries) * (head_dim_value * num_heads); + } + q_start = 0; + k_start = 0; + } + + // Advance to the current batch / head / query_start + query_ptr += (q_start + query_start) * q_strideM + head_id * q_strideH; + key_ptr += k_start * k_strideM + head_id * k_strideH; + + value_ptr += k_start * v_strideM + head_id * v_strideH; + output_ptr += + int64_t(q_start + query_start) * o_strideM + head_id * head_dim_value; + + if (kSupportsBias && attn_bias_ptr != nullptr) { + attn_bias_ptr += (batch_id * bias_strideB) + (head_id * bias_strideH); + } + if (output_accum_ptr != nullptr) { + output_accum_ptr += + int64_t(q_start + query_start) * (head_dim_value * num_heads) + + head_id * head_dim_value; + } else { + // Accumulate directly in the destination buffer (eg for f32) + output_accum_ptr = (accum_t*)output_ptr; + } + + if (logsumexp_ptr != nullptr) { + // lse[batch_id, head_id, query_start] + logsumexp_ptr += + batch_id * lse_dim * num_heads + head_id * lse_dim + query_start; + } + + // Custom masking + if (custom_mask_type == CausalFromBottomRight) { + causal_diagonal_offset = num_keys - num_queries; + } + // We use num_keys_absolute to index into the rng_state + // We need this index to match between forward and backwards + num_keys_absolute = num_keys; + if (custom_mask_type == CausalFromTopLeft || + custom_mask_type == CausalFromBottomRight) { + // the bottom row of the current block is query_start + kQueriesPerBlock + // the last active key is then query_start + causal_diagonal_offset + + // kQueriesPerBlock so num_keys is the min between actual num_keys and + // this to avoid extra computations + num_keys = cutlass::fast_min( + int32_t(query_start + causal_diagonal_offset + kQueriesPerBlock), + num_keys); + } + + num_queries -= query_start; + num_batches = 0; // no longer used after + + // If num_queries == 1, and there is only one key head we're wasting + // 15/16th of tensor core compute In that case : + // - we only launch kernels for head_id % kQueriesPerBlock == 0 + // - we iterate over heads instead of queries (strideM = strideH) + if (num_queries == 1 && k_strideH == 0 && v_strideH == 0 && + logsumexp_ptr == nullptr && window_size == 0) { + if (head_id % kQueriesPerBlock != 0) { + return false; + } + q_strideM = q_strideH; + bias_strideM = bias_strideH; + num_queries = num_heads; + num_heads = 1; // unused but here for intent + // remove causal since n_query = 1 + // otherwise, offset would change with head ! + custom_mask_type = NoCustomMask; + o_strideM = head_dim_value; + } + + // Make sure the compiler knows these variables are the same on all + // the threads of the warp. + // Only worth doing if they could have been modified above. + query_ptr = warp_uniform(query_ptr); + key_ptr = warp_uniform(key_ptr); + value_ptr = warp_uniform(value_ptr); + if (kSupportsBias) { + attn_bias_ptr = warp_uniform(attn_bias_ptr); + } + output_ptr = warp_uniform(output_ptr); + output_accum_ptr = warp_uniform(output_accum_ptr); + logsumexp_ptr = warp_uniform(logsumexp_ptr); + num_queries = warp_uniform(num_queries); + num_keys = warp_uniform(num_keys); + num_heads = warp_uniform(num_heads); + o_strideM = warp_uniform(o_strideM); + custom_mask_type = warp_uniform(custom_mask_type); + return true; + } + + __host__ dim3 getBlocksGrid() const { + return dim3( + ceil_div(num_queries, (int32_t)kQueriesPerBlock), + num_heads, + num_batches); + } + + __host__ dim3 getThreadsGrid() const { + return dim3(kWarpSize, kNumWarpsPerBlock, 1); + } + }; + + struct MM0 { + /* + In this first matmul, we compute a block of `Q @ K.T`. + While the calculation result is still hot in registers, we update + `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value + into a shared-memory ("AccumulatorSharedStorage") that is used later as + operand A for the second matmul (see MM1) + */ + using GemmType = DefaultGemmType; + + using OpClass = typename GemmType::OpClass; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + OpClass, + ArchTag, + scalar_t, + scalar_t, + scalar_t, // ElementC + accum_t // ElementAccumulator + >; + static constexpr int kAlignmentA = + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment; + static constexpr int kAlignmentB = + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; + using ThreadblockShape = cutlass::gemm:: + GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::ColumnMajor, // LayoutB, + kAlignmentB, + accum_t, + cutlass::layout::RowMajor, // LayoutC, + OpClass, + ArchTag, // ArchTag + ThreadblockShape, // ThreadblockShape + WarpShape, // WarpShape + typename GemmType::InstructionShape, // InstructionShape + ArchTag::kMinComputeCapability >= 80 && kIsHalf + ? 4 + : DefaultConfig::kStages, + typename GemmType::Operator // Operator + >::DefaultMma; + using MmaCore = typename DefaultMma::MmaCore; + using IteratorA = typename DefaultMma::IteratorA; + using IteratorB = typename DefaultMma::IteratorB; + using DefaultThreadblockMma = typename DefaultMma::ThreadblockMma; + using Mma = typename cutlass::platform::conditional< + kSingleValueIteration, + typename MakeCustomMma::Mma, + DefaultThreadblockMma>::type; + using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator< + typename Mma::Operator::IteratorC, + accum_t, + kWarpSize>::Iterator; + static_assert( + MmaCore::WarpCount::kM * MmaCore::WarpCount::kN * + MmaCore::WarpCount::kK == + kNumWarpsPerBlock, + ""); + + // used for efficient load of bias tile Bij from global to shared memory + using BiasLoader = TileSmemLoader< + scalar_t, + cutlass::MatrixShape, + MmaCore::kThreads, + // input restriction: kv_len has to be a multiple of this value + 128 / cutlass::sizeof_bits::value>; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< + typename Mma::Operator::IteratorC, + typename Mma::Operator, + scalar_t, + WarpShape, + ThreadblockShape>; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MM1 { + /** + Second matmul: perform `attn @ V` where `attn` is the attention (not + normalized) and stored in shared memory + */ + using GemmType = DefaultGemmType; + + using OpClass = typename GemmType::OpClass; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + OpClass, + ArchTag, + scalar_t, + scalar_t, + output_accum_t, // ElementC + accum_t // ElementAccumulator + >; + static constexpr int kAlignmentA = DefaultConfig::kAlignmentA; // from smem + static constexpr int kAlignmentB = + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; + using ThreadblockShape = cutlass::gemm:: + GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using LayoutB = cutlass::layout::RowMajor; + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + kAlignmentA, + scalar_t, // ElementB, + LayoutB, // LayoutB, + kAlignmentB, + output_accum_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + ArchTag::kMinComputeCapability >= 80 && kIsHalf + ? 4 + : DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Policy::Operator::Shape, // WarpShape + typename DefaultGemm::Mma::Policy::Operator::InstructionShape, + typename DefaultGemm::Mma::Policy::Operator::IteratorA, + typename DefaultGemm::Mma::Policy>::WarpIterator; + using DefaultMmaFromSmem = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MM0::AccumulatorSharedStorage::Shape::kN, // kMaxK + WarpIteratorA, + false>; // kScaleOperandA + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + static_assert( + WarpCount::kM * WarpCount::kN * WarpCount::kK == kNumWarpsPerBlock, + ""); + + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::PredicatedTileIterator< + typename DefaultEpilogue::OutputTileIterator::ThreadMap, + output_t>; + using OutputTileIteratorAccum = + typename cutlass::epilogue::threadblock::PredicatedTileIterator< + typename DefaultEpilogue::OutputTileIterator::ThreadMap, + output_accum_t>; + }; + + static constexpr int64_t kAlignmentQ = MM0::kAlignmentA; + static constexpr int64_t kAlignmentK = MM0::kAlignmentB; + static constexpr int64_t kAlignmentV = 1; + + // Shared storage - depends on kernel params + struct ScalingCoefs { + cutlass::Array m_prime; + cutlass::Array s_prime; + cutlass::Array mi; + cutlass::Array out_rescale; + cutlass::Array + addition_storage; + }; + + struct SharedStorageEpilogueAtEnd : ScalingCoefs { + struct SharedStorageAfterMM0 { + // Everything here might be overwritten during MM0 + union { + typename MM0::BiasLoader::SmemTile bias; + typename MM0::AccumulatorSharedStorage si; + }; + typename MM1::Mma::SharedStorage mm1; + }; + + union { + typename MM0::Mma::SharedStorage mm0; + SharedStorageAfterMM0 after_mm0; + typename MM1::DefaultEpilogue::SharedStorage epilogue; + }; + + CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& + epilogue_shared_storage() { + return epilogue; + } + }; + + struct SharedStorageEpilogueInLoop : ScalingCoefs { + struct SharedStorageAfterMM0 { + // Everything here might be overwritten during MM0 + union { + typename MM0::BiasLoader::SmemTile bias; + typename MM0::AccumulatorSharedStorage si; + }; + typename MM1::Mma::SharedStorage mm1; + typename MM1::DefaultEpilogue::SharedStorage epilogue; + }; + + union { + typename MM0::Mma::SharedStorage mm0; + SharedStorageAfterMM0 after_mm0; + }; + + CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& + epilogue_shared_storage() { + return after_mm0.epilogue; + } + }; + + using SharedStorage = typename cutlass::platform::conditional< + kSingleValueIteration || kKeepOutputInRF, + SharedStorageEpilogueAtEnd, + SharedStorageEpilogueInLoop>::type; + + static bool __host__ check_supported(Params const& p) { + CHECK_ALIGNED_PTR(p.query_ptr, kAlignmentQ); + CHECK_ALIGNED_PTR(p.key_ptr, kAlignmentK); + CHECK_ALIGNED_PTR(p.value_ptr, kAlignmentV); + if (kSupportsBias) { + CHECK_ALIGNED_PTR(p.attn_bias_ptr, kAlignmentQ); + TORCH_CHECK( + p.num_batches <= 1 || p.bias_strideB % kAlignmentQ == 0, + "attn_bias is not correctly aligned (strideB). ", + "attn_bias.stride( 0) = ", p.bias_strideB, ", and should be a " + "multiple of ", kAlignmentQ, "."); + TORCH_CHECK( + p.num_heads <= 1 || p.bias_strideH % kAlignmentQ == 0, + "attn_bias is not correctly aligned (strideH). " + "attn_bias.stride(1) = ", p.bias_strideH, ", and should be a " + "multiple of ", kAlignmentQ, "."); + TORCH_CHECK( + p.num_queries <= 1 || p.bias_strideM % kAlignmentQ == 0, + "attn_bias is not correctly aligned (strideM). " + "attn_bias.stride(2) = ", p.bias_strideM, ", and should be a " + "multiple of ", kAlignmentQ, "."); + } + TORCH_CHECK( + p.q_strideM % kAlignmentQ == 0, + "query is not correctly aligned (strideM)"); + TORCH_CHECK( + p.k_strideM % kAlignmentK == 0, + "key is not correctly aligned (strideM)"); + TORCH_CHECK( + p.v_strideM % kAlignmentV == 0, + "value is not correctly aligned (strideM)"); + TORCH_CHECK( + p.num_heads <= 1 || p.q_strideH % kAlignmentQ == 0, + "query is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_heads <= 1 || p.k_strideH % kAlignmentK == 0, + "key is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_heads <= 1 || p.v_strideH % kAlignmentV == 0, + "value is not correctly aligned (strideH)"); + TORCH_CHECK( + p.custom_mask_type < NumCustomMaskTypes, + "invalid value for `custom_mask_type`"); + if (p.window_size > 0) { + TORCH_CHECK( + p.custom_mask_type == CausalFromTopLeft || + p.custom_mask_type == CausalFromBottomRight, + "custom_mask_type not supported"); + } + return true; + } + + static void CUTLASS_DEVICE attention_kernel(Params& p) { + // In this block, we will only ever: + // - read query[query_start:query_end, :] + // - write to output[query_start:query_end, :] + + extern __shared__ char smem_buffer[]; + SharedStorage& shared_storage = *((SharedStorage*)smem_buffer); + auto& m_prime = shared_storage.m_prime; + auto& s_prime = shared_storage.s_prime; + auto& mi = shared_storage.mi; + auto& out_rescale = shared_storage.out_rescale; + const uint32_t query_start = blockIdx.x * kQueriesPerBlock; + + static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); + if (thread_id() < kQueriesPerBlock) { + s_prime[thread_id()] = accum_t(0); + out_rescale[thread_id()] = accum_t(1.0); + m_prime[thread_id()] = + -cutlass::platform::numeric_limits::infinity(); + mi[thread_id()] = -cutlass::platform::numeric_limits::infinity(); + } + typename MM1::Mma::FragmentC accum_o; + accum_o.clear(); + + auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator { + using OutputTileIterator = typename MM1::OutputTileIterator; + return OutputTileIterator( + typename OutputTileIterator::Params{(int32_t)p.o_strideM}, + p.output_ptr, + typename OutputTileIterator::TensorCoord{ + p.num_queries, p.head_dim_value}, + thread_id(), + {0, col}); + }; + + auto createOutputAccumIter = [&](int col) -> + typename MM1::OutputTileIteratorAccum { + using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum; + return OutputTileIteratorAccum( + typename OutputTileIteratorAccum::Params{ + (int32_t)(p.head_dim_value * p.num_heads)}, + p.output_accum_ptr, + typename OutputTileIteratorAccum::TensorCoord{ + p.num_queries, p.head_dim_value}, + thread_id(), + {0, col}); + }; + + curandStatePhilox4_32_10_t curand_state_init; + if (kSupportsDropout && p.use_dropout) { + const auto [seed, offset] = at::cuda::philox::unpack(p.rng_engine_inputs); + if (p.rng_engine_inputs.captured_) { + // See Note [Seed and Offset Device] + // When we are in cuda graph capture mode the seed and offset are stored + // on device We pass in int64_t* seed, and int64_t* offset to act as + // scratch space for storing the rng state during the forward pass and + // saving for backwards. + *p.seed = seed; + *p.extragraph_offset = offset; + } + // each element of the attention matrix P with shape + // (batch_sz, n_heads, n_queries, n_keys) is associated with a single + // offset in RNG sequence. we initialize the RNG state with offset that + // starts at the beginning of a (n_queries, n_keys) matrix for this + // block's batch_id and head_id + // initializing rng state is very expensive, so we run once per kernel, + // rather than once per iteration. each iteration takes a copy of the + // initialized RNG state and offsets it as needed. + curand_init( + seed, + 0, + offset + p.dropout_batch_head_rng_offset, + &curand_state_init); + } + + // Iterate through keys + for (int32_t iter_key_start = 0; iter_key_start < p.num_keys; + iter_key_start += kKeysPerBlock) { + if (p.window_size > 0) { + // don't compute anything if below attention band + if (iter_key_start + kKeysPerBlock < + int32_t(query_start + p.causal_diagonal_offset) - p.window_size) { + continue; + } + } + int32_t problem_size_0_m = + cutlass::fast_min((int32_t)kQueriesPerBlock, p.num_queries); + int32_t problem_size_0_n = cutlass::fast_min( + int32_t(kKeysPerBlock), p.num_keys - iter_key_start); + int32_t const& problem_size_0_k = p.head_dim; + int32_t const& problem_size_1_n = p.head_dim_value; + int32_t const& problem_size_1_k = problem_size_0_n; + + auto prologueV = [&](int blockN) { + typename MM1::Mma::IteratorB iterator_V( + typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)}, + const_cast(p.value_ptr + iter_key_start * p.v_strideM), + {problem_size_1_k, problem_size_1_n}, + thread_id(), + cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); + MM1::Mma::prologue( + shared_storage.after_mm0.mm1, + iterator_V, + thread_id(), + problem_size_1_k); + }; + + __syncthreads(); // Need to have shared memory initialized, and `m_prime` + // updated from end of prev iter + // + // MATMUL: Q.K_t + // + // Computes the block-matrix product of: + // (a) query[query_start:query_end, :] + // with + // (b) key[iter_key_start:iter_key_start + kKeysPerBlock] + // and stores that into `shared_storage.si` + // + + // Compute threadblock location + cutlass::gemm::GemmCoord tb_tile_offset = {0, 0, 0}; + + cutlass::MatrixCoord tb_offset_A{ + tb_tile_offset.m() * MM0::Mma::Shape::kM, tb_tile_offset.k()}; + + cutlass::MatrixCoord tb_offset_B{ + tb_tile_offset.k(), tb_tile_offset.n() * MM0::Mma::Shape::kN}; + + // Construct iterators to A and B operands + typename MM0::IteratorA iterator_A( + typename MM0::IteratorA::Params( + typename MM0::MmaCore::LayoutA(p.q_strideM)), + const_cast(p.query_ptr), + {problem_size_0_m, problem_size_0_k}, + thread_id(), + tb_offset_A); + + typename MM0::IteratorB iterator_B( + typename MM0::IteratorB::Params( + typename MM0::MmaCore::LayoutB(p.k_strideM)), + const_cast(p.key_ptr + iter_key_start * p.k_strideM), + {problem_size_0_k, problem_size_0_n}, + thread_id(), + tb_offset_B); + + auto my_warp_id = warp_uniform(warp_id()); + auto my_lane_id = lane_id(); + + // Construct thread-scoped matrix multiply + typename MM0::Mma mma( + shared_storage.mm0, thread_id(), my_warp_id, my_lane_id); + + typename MM0::Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + __syncthreads(); + + if (kPreloadV) { + prologueV(0); + } + + typename MM0::Mma::Operator::IteratorC::TensorCoord + iteratorC_tile_offset = { + (tb_tile_offset.m() * MM0::Mma::WarpCount::kM) + + (my_warp_id % MM0::Mma::WarpCount::kM), + (tb_tile_offset.n() * MM0::Mma::WarpCount::kN) + + (my_warp_id / MM0::Mma::WarpCount::kM)}; + + // multiply by scaling factor + if (kSupportsBias) { + accum = + cutlass::multiplies()(p.scale, accum); + } + + // apply attention bias if applicable + if (kSupportsBias && p.attn_bias_ptr != nullptr) { + // load bias tile Bij into shared memory + typename MM0::BiasLoader::GmemTileIterator bias_iter( + {cutlass::layout::RowMajor(p.bias_strideM)}, + // attn_bias_pointer points to matrix of size (n_queries, n_keys) + // for the relevant batch_id and head_id + const_cast(p.attn_bias_ptr + query_start * p.bias_strideM + iter_key_start), + {problem_size_0_m, problem_size_0_n}, + thread_id()); + cutlass::TensorRef bias_tensor_ref( + shared_storage.after_mm0.bias.data(), + cutlass::layout::RowMajor(MM0::ThreadblockShape::kN)); + typename MM0::BiasLoader::SmemTileIterator smem_tile_iter( + bias_tensor_ref, thread_id()); + MM0::BiasLoader::load(bias_iter, smem_tile_iter); + + // Pij += Bij, Pij is in register fragment and Bij is in shared memory + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + my_lane_id, my_warp_id, iteratorC_tile_offset); + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (accum_m < problem_size_0_m && accum_n < problem_size_0_n) { + accum[idx] += bias_tensor_ref.at({accum_m, accum_n}); + } + }, + [&](int accum_m) {}); + } + + // Mask out last if causal + // This is only needed if upper-right corner of current query / key block + // intersects the mask Coordinates of upper-right corner of current block + // is y=query_start x=min(iter_key_start + kKeysPerBlock, num_keys)) The + // first masked element is x = y + offset -> query_start + offset There is + // intersection (and we need to mask) if min(iter_key_start + + // kKeysPerBlock, num_keys)) >= query_start + offset + if (p.custom_mask_type && + cutlass::fast_min(iter_key_start + kKeysPerBlock, p.num_keys) >= + (query_start + p.causal_diagonal_offset)) { + auto query_start = blockIdx.x * kQueriesPerBlock; + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + my_lane_id, my_warp_id, iteratorC_tile_offset); + int32_t last_col; + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + // last absolute col is (last absolute query + offset) + // last local col is (last absolute query + offset - + // iter_key_start) + last_col = query_start + accum_m + p.causal_diagonal_offset - + iter_key_start; + }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n > last_col) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + } + + // Mask out lower left corner of block if window_size > 0 + // only required if current block intersects with the lower left corner + // block starts at x_lowerleft = iter_key_start // y = query_start + + // kQueriesPerBlock first non masked value at this y is : x_first = + // query_start + kQueriesPerBlock - window_size mask if x_fist > + // x_lowerleft + + if (p.window_size > 0 && + (query_start + p.causal_diagonal_offset + + cutlass::fast_min( + int32_t(kQueriesPerBlock), int32_t(p.num_queries)) - + p.window_size >= + iter_key_start)) { + auto query_start = blockIdx.x * kQueriesPerBlock; + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + my_lane_id, my_warp_id, iteratorC_tile_offset); + int32_t first_col; + const int32_t offset = query_start + p.causal_diagonal_offset - + p.window_size - iter_key_start; + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { first_col = accum_m + offset; }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n <= first_col) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + // print_warp_accum(accum, lane_offset, 12, + // 12); + } + + // Update `mi` from accum stored in registers + // Also does accum[i] <- exp(accum[i] - mi) + iterative_softmax( + accum_o, + accum, + mi, + m_prime, + s_prime, + out_rescale, + shared_storage.addition_storage, + my_lane_id, + thread_id(), + my_warp_id, + p.num_keys - iter_key_start, + iter_key_start == 0, + iteratorC_tile_offset, + kSupportsBias ? 1.0f : p.scale); + + // Output results to shared-memory + int warp_idx_mn_0 = my_warp_id % + (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM, + warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM}; + + MM0::B2bGemm::accumToSmem( + shared_storage.after_mm0.si, accum, my_lane_id, output_tile_coords); + + __syncthreads(); + + // apply dropout (if applicable) after we've written Pij to smem. + // dropout is applied by multiplying each element of Pij by: + // - 0 with probability dropout_p + // - 1 / (1 - dropout_p) with probability 1 - dropout_p + // + // for backward purposes we want to be able to map each element of the + // attention matrix to the same random uniform number as the one we used + // in forward, without needing to use the same iteration order or having + // to store the dropout matrix. its possible to do this in registers but + // it ends up being very slow because each thread having noncontiguous + // strips of the Pij tile means we have to skip around a lot, and also + // have to generate a single random number at a time + if (kSupportsDropout && p.use_dropout) { + auto si = shared_storage.after_mm0.si.accum_ref(); + // each thread handles a contiguous sequence of elements from Sij, all + // coming from the same row. the reason they have to come from the same + // row is that the sampling random numbers from a contiguous random + // number sequence is much more efficient than jumping around, and the + // linear offset of each element of S (the global matrix) maps to an + // offset in a random number sequence. for S, the end of a row and the + // beginning of the next have adjacent offsets, but for Sij, this is not + // necessarily the case. + const int num_threads = blockDim.x * blockDim.y * blockDim.z; + const int threads_per_row = + cutlass::fast_min(num_threads / problem_size_0_m, problem_size_0_n); + const int elts_per_thread = cutlass::round_nearest( + cutlass::ceil_div(problem_size_0_n, threads_per_row), 4); + + const int thread_i = thread_id() / threads_per_row; + const int thread_start_j = + (thread_id() % threads_per_row) * elts_per_thread; + + if (thread_i < problem_size_0_m && thread_start_j < problem_size_0_n) { + curandStatePhilox4_32_10_t curand_state = curand_state_init; + skipahead( + static_cast( + (query_start + thread_i) * p.num_keys_absolute + + (iter_key_start + thread_start_j)), + &curand_state); + const float dropout_scale = 1.0 / (1.0 - p.dropout_prob); + + // apply dropout scaling to elements this thread is responsible for, + // in chunks of 4 + for (int sij_start_col_idx = thread_start_j; sij_start_col_idx < + cutlass::fast_min(thread_start_j + elts_per_thread, + problem_size_0_n); + sij_start_col_idx += 4) { + const float4 rand_uniform_quad = curand_uniform4(&curand_state); + + CUTLASS_PRAGMA_UNROLL + for (int quad_idx = 0; quad_idx < 4; ++quad_idx) { + si.at({thread_i, sij_start_col_idx + quad_idx}) *= + static_cast( + dropout_scale * + ((&rand_uniform_quad.x)[quad_idx] > p.dropout_prob)); + } + } + } + __syncthreads(); // p.use_dropout should have same value kernel-wide + } + + // + // MATMUL: Attn . V + // Run the matmul `attn @ V` for a block of attn and V. + // `attn` is read from shared memory (in `shared_storage_si`) + // `V` is read from global memory (with iterator_B) + // + + const int64_t nBlockN = kSingleValueIteration + ? 1 + : ceil_div( + (int64_t)problem_size_1_n, int64_t(MM1::ThreadblockShape::kN)); + for (int blockN = 0; blockN < nBlockN; ++blockN) { + int gemm_k_iterations = + (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add and store it in accum + // (in registers) + if (!kPreloadV) { + __syncthreads(); // we share shmem between mma and epilogue + } + + typename MM1::Mma::IteratorB iterator_V( + typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)}, + const_cast(p.value_ptr + iter_key_start * p.v_strideM), + {problem_size_1_k, problem_size_1_n}, + thread_id(), + cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); + typename MM1::Mma mma_pv( + // operand A: Pij_dropped in shared memory + shared_storage.after_mm0.si.accum_ref(), + // operand B: shared memory staging area for Vj, which is loaded + // from global memory + shared_storage.after_mm0.mm1.operand_B_ref(), + (int)thread_id(), + (int)my_warp_id, + (int)my_lane_id); + mma_pv.set_prologue_done(kPreloadV); + if (!kKeepOutputInRF) { + accum_o.clear(); + } + mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o); + __syncthreads(); + + if (kPreloadV && !kSingleValueIteration && blockN + 1 < nBlockN) { + prologueV(blockN + 1); + } + + if (!kKeepOutputInRF) { + int first_key = 0; + if (p.window_size > 0) { + first_key = (cutlass::fast_max( + int(query_start + p.causal_diagonal_offset) - + p.window_size + 1, + 0) / + kKeysPerBlock) * + kKeysPerBlock; + } + + // int first_key_block = 0; + // MM1::Mma::drain_cp_asyncs(); # TODO figure out if this is needed for correctness + DISPATCH_BOOL( + iter_key_start == first_key, kIsFirst, ([&] { + DISPATCH_BOOL( + (iter_key_start + kKeysPerBlock) >= p.num_keys, + kIsLast, + ([&] { + using DefaultEpilogue = typename MM1::DefaultEpilogue; + using DefaultOp = + typename MM1::DefaultConfig::EpilogueOutputOp; + using ElementCompute = typename DefaultOp::ElementCompute; + using EpilogueOutputOp = typename cutlass::epilogue:: + thread::MemoryEfficientAttentionNormalize< + typename cutlass::platform::conditional< + kIsLast, + output_t, + output_accum_t>::type, + output_accum_t, + DefaultOp::kCount, + typename DefaultOp::ElementAccumulator, + ElementCompute, + kIsFirst, + kIsLast, + cutlass::Array>; + using Epilogue = typename cutlass::epilogue::threadblock:: + EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename MM1::Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename cutlass::platform::conditional< + kIsLast, + typename MM1::OutputTileIterator, + typename MM1::OutputTileIteratorAccum>::type, + typename DefaultEpilogue:: + AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true, // IterationsUnroll + typename MM1::OutputTileIteratorAccum // Read + // iterator + >; + + int col = blockN * MM1::Mma::Shape::kN; + auto source_iter = createOutputAccumIter(col); + auto dest_iter = call_conditional< + kIsLast, + decltype(createOutputIter), + decltype(createOutputAccumIter)>:: + apply(createOutputIter, createOutputAccumIter, col); + EpilogueOutputOp rescale(s_prime, out_rescale); + Epilogue epilogue( + shared_storage.epilogue_shared_storage(), + thread_id(), + my_warp_id, + my_lane_id); + epilogue(rescale, dest_iter, accum_o, source_iter); + })); + })); + if (!kSingleValueIteration) { + __syncthreads(); + } + } + } + __syncthreads(); // we modify `m_prime` after + } + + if (kKeepOutputInRF) { + constexpr bool kIsFirst = true; + constexpr bool kIsLast = true; + using DefaultEpilogue = typename MM1::DefaultEpilogue; + using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp; + using ElementCompute = typename DefaultOp::ElementCompute; + using EpilogueOutputOp = + typename cutlass::epilogue::thread::MemoryEfficientAttentionNormalize< + output_t, // output + output_accum_t, // source + DefaultOp::kCount, + typename DefaultOp::ElementAccumulator, // accum + output_accum_t, // compute + kIsFirst, + kIsLast, + cutlass::Array>; + using Epilogue = + typename cutlass::epilogue::threadblock::EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename MM1::Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename MM1::OutputTileIterator, // destination + typename DefaultEpilogue::AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true, // IterationsUnroll + typename MM1::OutputTileIteratorAccum // source tile + >; + auto dest_iter = createOutputIter(0); + EpilogueOutputOp rescale(s_prime, out_rescale); + Epilogue epilogue( + shared_storage.epilogue_shared_storage(), + thread_id(), + warp_id(), + lane_id()); + epilogue(rescale, dest_iter, accum_o); + } + + // 7. Calculate logsumexp + // To make the backward easier, we pad logsumexp with `inf` + // this avoids a few bound checks, and is not more expensive during fwd + static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); + if (p.logsumexp_ptr && thread_id() < kQueriesPerBlock) { + auto lse_dim = ceil_div((int32_t)p.num_queries, kAlignLSE) * kAlignLSE; + constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E + if (thread_id() < p.num_queries) { + // We set fully masked out rows to 0, the sumexp for masked out rows will be 0 + // We update it to be 1 prior to calling log so that log(1) = 0 + s_prime[thread_id()] = (s_prime[thread_id()] == 0) ? 1: s_prime[thread_id()]; + mi[thread_id()] = (mi[thread_id()] == -cutlass::platform::numeric_limits::infinity()) ? 0: mi[thread_id()]; + p.logsumexp_ptr[thread_id()] = accum_t(mi[thread_id()] / kLog2e) + + cutlass::fast_log(accum_t(s_prime[thread_id()])); + } else if (thread_id() < lse_dim) { + p.logsumexp_ptr[thread_id()] = + cutlass::platform::numeric_limits::infinity(); + } + } + } + + template + CUTLASS_DEVICE static void iterative_softmax( + typename WarpIteratorC::Fragment& frag_o, // output so far + typename WarpIteratorC::Fragment& frag, + cutlass::Array& mi, + cutlass::Array& m_prime, + cutlass::Array& s_prime, + cutlass::Array& out_rescale, + cutlass::Array& + addition_storage, + int8_t lane_id, + int8_t thread_id, + int8_t warp_id, + int max_col, + bool is_first, + typename WarpIteratorC::TensorCoord const& tile_offset, + float scaling) { + /* Iterates on the accumulator and corresponding position on result matrix + + (1) Update `mi[r]` to the max value of the row `r` + (2) In a second iteration do the following: + (a) accum <- exp(accum - mi) + (b) m_prime <- exp(m_prime - mi) + (c) s_prime <- s_prime * m_prime + sum(accum) + + All of this is done on registers, before we store all of this + on shared memory for the next matmul with Value. + */ + using Fragment = typename WarpIteratorC::Fragment; + using LambdaIterator = typename DefaultMmaAccumLambdaIterator< + WarpIteratorC, + accum_t, + kWarpSize>::Iterator; + // Convert to `accum_t` (rather than double) + constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E + + static_assert(kQueriesPerBlock % kNumWarpsPerBlock == 0, ""); + static constexpr int kLinesPerWarp = kQueriesPerBlock / kNumWarpsPerBlock; + + frag = cutlass::multiplies()(scaling * kLog2e, frag); + + auto lane_offset = + LambdaIterator::get_lane_offset(lane_id, warp_id, tile_offset); + + // First update `mi` to the max per-row + { + accum_t max; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + max = -cutlass::platform::numeric_limits::infinity(); + }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n < max_col) { + max = cutlass::fast_max(max, frag[idx]); + } + }, + [&](int accum_m) { + // Having 4x atomicMax seems faster than reduce within warp + // first... + atomicMaxFloat(&mi[accum_m], max); + }); + } + + // Make sure we all share the update values for `mi` + __syncthreads(); + + // Doing this `exp` is quite expensive. Let's + // split it across the warps + bool restore_mi_to_minus_inf = false; + if (lane_id < kLinesPerWarp) { + int id = warp_id * kLinesPerWarp + lane_id; + auto m_prime_id = m_prime[id]; + auto mi_id = mi[id]; + bool changed = m_prime_id < mi_id; // `false` if both are -inf + if (changed) { + auto m_prime_exp = exp2f(m_prime_id - mi_id); + out_rescale[id] = m_prime_exp; + s_prime[id] *= m_prime_exp; + } else { + // Only when bias is enabled, it's possible that all the first values + // of attention are masked to `-inf`. In that case we want to avoid + // `nan = exp2f(-inf - (-inf))` so we temporarily set `mi` to 0 + if (kSupportsBias && + mi_id == -cutlass::platform::numeric_limits::infinity()) { + restore_mi_to_minus_inf = true; + mi[id] = 0.0f; + } + out_rescale[id] = 1.0f; + } + } + __syncthreads(); // Update output fragments + if (kKeepOutputInRF && !is_first) { + accum_t line_rescale; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { line_rescale = out_rescale[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + frag_o[idx] = frag_o[idx] * line_rescale; + }, + [&](int accum_m) {}); + } + // Update accum_m, accum_n, ... + { + accum_t mi_row, total_row; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { mi_row = mi[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + frag[idx] = + (accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0); + }, + [&](int accum_m) {}); + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { total_row = 0.0; }, + [&](int accum_m, int accum_n, int idx) { total_row += frag[idx]; }, + [&](int accum_m) { + if (LambdaIterator::reduceSameRow( + lane_id, total_row, [](accum_t a, accum_t b) { + return a + b; + })) { + // NOTE: we could atomically add `total_row` to `s_prime`, but + // it's faster (and deterministic) to avoid atomics here + addition_storage + [accum_m + kQueriesPerBlock * tile_offset.column()] = + total_row; + } + }); + } + __syncthreads(); + if (lane_id < kLinesPerWarp) { + int id = warp_id * kLinesPerWarp + lane_id; + accum_t total_row = s_prime[id]; + if (restore_mi_to_minus_inf) { + // Restore `mi`, see above when we set `restore_mi_to_minus_inf=true` + mi[id] = -cutlass::platform::numeric_limits::infinity(); + } else { + m_prime[id] = mi[id]; + } + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) { + total_row += addition_storage[id + kQueriesPerBlock * i]; + } + s_prime[id] = total_row; + } + } + + static CUTLASS_DEVICE int8_t lane_id() { + return threadIdx.x; + } + static CUTLASS_DEVICE int8_t warp_id() { + return threadIdx.y; + } + static CUTLASS_DEVICE int16_t thread_id() { + return threadIdx.x + threadIdx.y * blockDim.x; + } +}; + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_batched_impl(typename AK::Params p) { + if (!p.advance_to_block()) { + return; + } + AK::attention_kernel(p); +} + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_batched(typename AK::Params params); + +} // namespace PyTorchMemEffAttention + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/pytorch_utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/pytorch_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..0bc1d1b36a1014d4dc8db3c777189f342ffe54fc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/pytorch_utils.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include + +#include +#include + + +template +struct CutlassToAtenDtype; + +template <> +struct CutlassToAtenDtype { + using scalar_t = cutlass::half_t; + + static constexpr __host__ at::ScalarType atScalarType() { + return at::ScalarType::Half; + } +}; + +template <> +struct CutlassToAtenDtype { + using scalar_t = cutlass::bfloat16_t; + + static constexpr __host__ at::ScalarType atScalarType() { + return at::ScalarType::BFloat16; + } +}; + +template <> +struct CutlassToAtenDtype { + using scalar_t = float; + + static constexpr __host__ at::ScalarType atScalarType() { + return at::ScalarType::Float; + } +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/sdp_utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/sdp_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..29248fe216b89a88359e6386cdc46caf1d3b7a33 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/sdp_utils.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace sdp { + +bool check_for_seq_len_1_nested_tensor(sdp_params const& params, bool debug); +SDPBackend select_sdp_backend(sdp_params const& kernel_params); +C10_EXPORT bool is_flash_attention_available(); +C10_EXPORT bool can_use_flash_attention(sdp_params const& params, bool debug); +C10_EXPORT bool can_use_mem_efficient_attention(sdp_params const& params, bool debug); +C10_EXPORT bool can_use_cudnn_attention(sdp_params const& params, bool debug); + +} // namespace sdp + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/aotriton_adapter.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/aotriton_adapter.h new file mode 100644 index 0000000000000000000000000000000000000000..ea7959d0ff81a48be7e71c1878fe9200ca7506df --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/aotriton_adapter.h @@ -0,0 +1,190 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_ROCM + +// Expect to be included after headers of at::zeros_like and at::empty_like + +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// +// Common macros copied from cuda/mem_eff_attention/gemm_kernel_utils.h +//////////////////////////////////////////////////////////////////////////////// + +namespace sdp { + +namespace aotriton_adapter { + +inline aotriton::DType cast_dtype(caffe2::TypeMeta t_dtype) +{ +#define CAST_TYPE(aname, dtname) if (t_dtype == at::aname) return aotriton::DType::dtname + CAST_TYPE(kByte, kUInt8); + CAST_TYPE(kUInt16, kUInt16); + CAST_TYPE(kUInt32, kUInt32); + CAST_TYPE(kUInt64, kUInt64); + CAST_TYPE(kChar, kInt8); + CAST_TYPE(kShort, kInt16); + CAST_TYPE(kInt, kInt32); + CAST_TYPE(kLong, kInt64); + CAST_TYPE(kHalf, kFloat16); + CAST_TYPE(kFloat, kFloat32); + CAST_TYPE(kBFloat16, kBFloat16); + return aotriton::DType::kUnknown; +#undef CAST_TYPE +} + +template +struct IntArrayRefCaster { + // std::array cast(IntArrayRef); +}; + +template +struct IntArrayRefCaster { + static auto cast(at::IntArrayRef ref) { + return std::array{{ static_cast(ref.at(0)) }}; + } +}; + +template +struct IntArrayRefCaster { + static auto cast(at::IntArrayRef ref) { + return std::array{{ + static_cast(ref.at(0)), + static_cast(ref.at(1)) + }}; + } +}; + +template +struct IntArrayRefCaster { + static auto cast(at::IntArrayRef ref) { + return std::array{{ + static_cast(ref.at(0)), + static_cast(ref.at(1)), + static_cast(ref.at(2)) + }}; + } +}; + +template +struct IntArrayRefCaster { + static auto cast(at::IntArrayRef ref) { + return std::array{{ + static_cast(ref.at(0)), + static_cast(ref.at(1)), + static_cast(ref.at(2)), + static_cast(ref.at(3)) + }}; + } +}; + + +template +aotriton::TensorView mk_aotensor(const at::Tensor& q, std::string_view tensor_name) +{ + const auto strides = q.strides(); + int real_rank = strides.size(); + if (real_rank != Rank) { // Lazy conversion of tensor_name + TORCH_CHECK(false, + std::string(tensor_name) + "'s rank should be " + std::to_string(Rank) + + " but is " + std::to_string(real_rank)); + } + return aotriton::TensorView(reinterpret_cast(q.data_ptr()), + IntArrayRefCaster::cast(q.sizes()), + IntArrayRefCaster::cast(strides), + cast_dtype(q.dtype())); +} + +inline aotriton::TensorView<0> mk_aoscalartensor(const at::Tensor& q) +{ + return aotriton::TensorView<0>(reinterpret_cast(q.data_ptr()), + cast_dtype(q.dtype())); +} + +inline aotriton::TensorView<0> mk_philoxtensor(const int64_t* ptr) +{ + return aotriton::TensorView<0>(reinterpret_cast(ptr), + aotriton::DType::kUInt64); // AOTriton accepts unsigned int64 +} + +inline aotriton::TensorView<0> mk_atomictensor(const int32_t* ptr) +{ + return aotriton::TensorView<0>(reinterpret_cast(ptr), + aotriton::DType::kInt32); +} + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) + +struct LazyTensorContext { + at::Tensor like_tensor; + std::string_view tensor_name; + at::Tensor tensor; +}; + +template +struct LazyTensorFunctions : public LazyTensorContext { + static aotriton::TensorView acquire(void* cookie) { + auto ctx = (LazyTensorContext*)cookie; + if (!ctx->tensor.defined()) { + auto q = ctx->like_tensor; + if constexpr (kRequireZeros) { + ctx->tensor = at::zeros(q.sizes(), + q.options().dtype(at::kFloat)); + } else { + ctx->tensor = at::empty_like(q); + } + } + return mk_aotensor(ctx->tensor, ctx->tensor_name); + } + + static void dispose(void* cookie) { + } +}; + +template +aotriton::LazyTensor mklazy_common(LazyTensorContext* cookie) +{ + using LTF = LazyTensorFunctions; + return aotriton::LazyTensor { + .cookie = cookie, + .acquire = <F::acquire, + .dispose = <F::dispose + }; +} + +template +auto mklazy_empty_like(LazyTensorContext* cookie) +{ + return mklazy_common(cookie); +} + + +// Note: this will not keep the original strides +template +auto mklazy_fp32zeros(LazyTensorContext* cookie) +{ + return mklazy_common(cookie); +} + +#endif // >= 0.11 + +} // namespace aotriton_adapter + +} // namespace sdp + +namespace at::native { + +inline int64_t ceil_div(int64_t numerator, int64_t denominator) { + return (numerator + (denominator - 1)) / denominator; +} + +} + +#endif // USE_ROCM + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/aotriton_versions.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/aotriton_versions.h new file mode 100644 index 0000000000000000000000000000000000000000..c284695f7a503789fb0843725ac5d21da0cb82ae --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/aotriton_versions.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_ROCM + +#define AOTRITON_VERSION_INT(x, y) (x * 100 + y) +#define AOTRITON_VERSION_CURRENT (AOTRITON_VERSION_MAJOR * 100 + AOTRITON_VERSION_MINOR) + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) +#define AOTRITON_ALWAYS_V3_API 1 +#else +#define AOTRITON_ALWAYS_V3_API 0 +#endif + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 10) +#define AOTRITON_V3_API 1 +#else +#define AOTRITON_V3_API 0 +#endif + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/gemm_kernel_utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/gemm_kernel_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..c1ef5d0ce501e14c665cc767fa51400938c01ec9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/gemm_kernel_utils.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file is a trimmed version of cuda/mem_eff_attention/gemm_kernel_utils.h +#pragma once + +#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK(TENSOR.is_contiguous()); + +#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK( \ + TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous"); + +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + TORCH_CHECK( \ + uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") + +#define ASSIGN_CHECK_OVERFLOW(A, B) \ + { \ + A = B; \ + TORCH_CHECK( \ + B < std::numeric_limits::max(), #B " overflows"); \ + } + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f0d1ee26d31ec437ed2d66b6f8eedfb25277736d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils.h @@ -0,0 +1,93 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +void alloc_with_matching_layout( + const Tensor& q, + Tensor& output, + const std::vector& shape) { + TORCH_INTERNAL_ASSERT( + shape.size() == q.sizes().size(), + "SDPA alloc_with_matching_layout got requested shape ndim != q ndim"); + + if (std::equal(q.sizes().begin(), q.sizes().end(), shape.begin())) { + output = at::empty_like(q); + return; + } + + // get the "fill order," which is just an argsort on the strides + std::vector fill_order(shape.size()); + std::iota(fill_order.begin(), fill_order.end(), 0); + const auto q_strides = q.strides(); + std::stable_sort( + fill_order.begin(), fill_order.end(), [&q_strides](int idx1, int idx2) { + return q_strides[idx1] ? q_strides[idx1] : 1 < q_strides[idx2] ? q_strides[idx2] : 1; + }); + std::vector ordered_strides(shape.size()); + int64_t current_stride = 1; + for (const int dim_idx : fill_order) { + ordered_strides[dim_idx] = current_stride; + current_stride *= shape[dim_idx]; + } + output = at::empty(at::IntArrayRef(shape), q.options()) + .as_strided( + at::IntArrayRef(shape), at::IntArrayRef(ordered_strides), 0); +} + +void permute_to_matching_layout(const Tensor& output, Tensor& grad_output) { + const int dims = output.sizes().size(); + std::vector outer_to_inner(dims); + std::iota(outer_to_inner.begin(), outer_to_inner.end(), 0); + const auto o_strides = output.strides(); + std::stable_sort( + outer_to_inner.begin(), + outer_to_inner.end(), + [&o_strides](int idx1, int idx2) { + return o_strides[idx1] > o_strides[idx2]; + }); + std::vector inverse(dims); + for (int d = 0; d < dims; d++) { + inverse[d] = std::find(outer_to_inner.begin(), outer_to_inner.end(), d) - + outer_to_inner.begin(); + } + grad_output = grad_output.permute(at::IntArrayRef(outer_to_inner)) + .contiguous() + .permute(at::IntArrayRef(inverse)); +} + +bool same_strides(const Tensor& t1, const Tensor& t2) { + std::vector t1_strides_no_ones; + std::vector t2_strides_no_ones; + const auto t1strides = t1.strides(); + const auto t2strides = t2.strides(); + const int dim = t1strides.size(); + if (dim != (int)t2strides.size()) { + return false; + } + const auto t1sizes = t1.sizes(); + const auto t2sizes = t2.sizes(); + + // we are going through strides backward here, but if both are backward it's + // comparable + for (int i = 0; i < dim; i++) { + if (t1sizes[i] > 1) { + t1_strides_no_ones.push_back(t1strides[i]); + } + if (t2sizes[i] > 1) { + t2_strides_no_ones.push_back(t2strides[i]); + } + } + return std::equal( + t1_strides_no_ones.begin(), + t1_strides_no_ones.end(), + t2_strides_no_ones.begin(), + t2_strides_no_ones.end()); +} +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h new file mode 100644 index 0000000000000000000000000000000000000000..6ffddcdb008d1a6faf8e31187bc166d6581d7755 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h @@ -0,0 +1,565 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace sdp { + +constexpr int32_t num_backends = at::num_sdp_backends; +using SDPBackend = at::SDPBackend; + +// Note that if this changed make sure to update +// the templated enum in mem_eff/kernel_forward.h and mem_eff/kernel_backward.h +enum class CustomMaskType { + NoCustomMask = 0, + CausalFromTopLeft = 1, + CausalFromBottomRight = 2, + NumCustomMaskTypes, +}; + +struct sdp_params { + at::Tensor query; + at::Tensor key; + at::Tensor value; + std::optional attn_mask; + double dropout; + bool is_causal; + bool enable_gqa; +}; + +SDPBackend select_sdp_backend_cpp(sdp_params const& kernel_params); + +inline c10::SymFloat calculate_scale( + const at::Tensor& query, + std::optional scale) { + const auto softmax_scale = scale.has_value() + ? scale.value() + : (c10::SymFloat(1.0) / (c10::SymFloat(query.sym_size(-1)).sqrt())); + return c10::SymFloat(softmax_scale); +} + +inline bool input_requires_grad(sdp_params const& params) { + const bool any_inputs_require_grad = params.query.requires_grad() || + params.key.requires_grad() || params.value.requires_grad(); + const bool gradmode_enabled = at::GradMode::is_enabled(); + return any_inputs_require_grad && gradmode_enabled; +} + +inline bool has_for_nested_inputs(sdp_params const& params) { + return + (params.query.is_nested() && params.query.layout() == c10::kStrided) || + (params.key.is_nested() && params.key.layout() == c10::kStrided) || + (params.value.is_nested() && params.value.layout() == c10::kStrided); +} + +inline bool has_for_dense_inputs(sdp_params const& params) { + return !params.query.is_nested() || !params.key.is_nested() || !params.value.is_nested(); +} + +inline bool has_only_dense_inputs(sdp_params const& params) { + return !params.query.is_nested() && !params.key.is_nested() && !params.value.is_nested(); +} + +template +inline bool check_tensor_dtype( + sdp_params const& params, + dtype_vector allowed_dtypes, + bool debug) { + auto query_dtype = params.query.dtype(); + if (!(query_dtype == params.key.dtype() && + query_dtype == params.value.dtype() && + (std::find(allowed_dtypes.begin(), allowed_dtypes.end(), query_dtype) != + allowed_dtypes.end()))) { + if (debug) { + TORCH_WARN( + "Expected query, key and value to all be of dtype: {", + c10::Join(", ", allowed_dtypes), + "}. Got ", + "Query dtype: ", + params.query.dtype(), + ", Key dtype: ", + params.key.dtype(), + ", and Value dtype: ", + params.value.dtype(), + " instead."); + } + return false; + } + return true; +} + + +inline bool try_broadcast_param_size( + const c10::SymInt q_size, + const c10::SymInt k_size, + const c10::SymInt v_size, + std::string_view param_name, + bool debug) { + auto max_size = std::max({q_size, k_size, v_size}); + if ((q_size != max_size && q_size != 1) || + (k_size != max_size && k_size != 1) || + (v_size != max_size && v_size != 1)) { + if (debug) { + TORCH_WARN( + "Both fused kernels require query, key and value to have broadcastable ", + param_name, + "got Query ", + param_name, + q_size, + ", Key ", + param_name, + k_size, + ", Value ", + param_name, + v_size, + " instead."); + } + return false; + } + return true; +} + +inline bool check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + at::Tensor const& param, + std::string_view param_name, + bool debug) { + const auto nt_tensor_impl = at::native::get_nested_tensor_impl(param); + const at::Tensor& sizes = nt_tensor_impl->get_nested_sizes(); + auto num_head_dims = nt_tensor_impl->opt_size(1); + if (!num_head_dims.has_value()) { + // num_head_dims is ragged + if (debug) { + TORCH_WARN( + "Fused kernels do not support ragged num_head_dims, ", + param_name, + "has a ragged num_heads."); + } + return false; + } + + auto* sizes_ptr = sizes.data_ptr(); + const int64_t n_tensors = param.size(0); + const int64_t size_tensor_stride = sizes.stride(0); + + // This is being called inside sdp with shape [batch, heads, {seq_len}, dim] + for (const auto i : c10::irange(n_tensors)) { + if (sizes_ptr[(i * size_tensor_stride) + 1] == 0) { + if (debug) { + TORCH_WARN( + "Fused kernels do not support seq_len == 0, ", + param_name, + "has a seq len of 0."); + } + return false; + } + } + return true; +} + +inline bool check_for_seq_len_0_nested_tensor(sdp_params const& params, bool debug) { + // When this function is called we are assured that the nt is dim==4 + bool q_is_safe = params.query.is_nested() + ? check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + params.query, "query ", debug) + : true; + // short circuit if any is unsafe + if (!q_is_safe) { + return false; + } + + bool k_is_safe = params.key.is_nested() + ? check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + params.key, "key ", debug) + : true; + if (!k_is_safe) { + return false; + } + + bool v_is_safe = params.value.is_nested() + ? check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + params.value, "value ", debug) + : true; + if (!v_is_safe) { + return false; + } + + // We now know none of the inputs have ragged num_heads, so we can safely + // access .size(1) + auto q_num_heads = params.query.size(1); + auto k_num_heads = params.key.size(1); + auto v_num_heads = params.value.size(1); + bool same_num_heads = + q_num_heads == k_num_heads && q_num_heads == v_num_heads; + + if (!same_num_heads) { + if (input_requires_grad(params)){ + if (debug) { + TORCH_WARN( + "Both fused kernels do not support training with broadcasted NT inputs."); + } + return false; + } + return try_broadcast_param_size( + q_num_heads, k_num_heads, v_num_heads, "num heads ", debug); + } + + return true; +} + +inline bool check_nested_tensor(sdp_params const& params, bool debug) { + // Return false if have nested tensor + if (!has_only_dense_inputs(params)) { + if (debug) { + TORCH_WARN( + "Both fused kernels of cpp version currently do not support Nested Tensor inputs."); + } + return false; + } + return true; +} + +inline bool check_for_dropout(sdp_params const& params, bool debug) { + if (params.dropout > 0.0) { + if (debug) { + TORCH_WARN("Both fused kernels do not support non-zero dropout."); + } + return false; + } + return true; +} + +inline bool check_requires_grad_and_nested(sdp_params const& params, bool debug) { + if (input_requires_grad(params)) { + if (debug) { + TORCH_WARN( + "Memory efficient attention currently doesn't support training with NT inputs."); + } + return false; + } + return true; +} + +inline bool check_for_attn_mask(sdp_params const& params, bool debug) { + if (params.attn_mask.has_value()) { + if (debug) { + TORCH_WARN("Flash Attention does not support non-null attn_mask."); + } + return false; + } + return true; +} + +inline bool check_attn_mask_shape(sdp_params const& params, bool debug) { + auto attn_mask = params.attn_mask; + if (!attn_mask.has_value()) { + return true; + } + if (attn_mask.value().requires_grad()) { + return false; + } + auto batchSize = params.query.sym_size(0); + auto qSize = params.query.sym_size(2); + auto kvSize = params.key.sym_size(2); + auto num_head = params.query.sym_size(1); + if (attn_mask.value().sym_size(-2) != qSize && attn_mask.value().sym_size(-2) != 1) { + return false; + } + if (attn_mask.value().sym_size(-1) != kvSize && attn_mask.value().sym_size(-1) != 1) { + return false; + } + if (attn_mask.value().dim() == 2) { + return true; + } else if (attn_mask.value().dim() == 4) { + if ((attn_mask.value().sym_size(0) == 1 || attn_mask.value().sym_size(0) == batchSize) + && (attn_mask.value().sym_size(1) == 1 || attn_mask.value().sym_size(1) == num_head)) { + return true; + } + } + if (debug) { + TORCH_WARN("Please use the following attn mask shapes: ", + "2d - ({Q_seq_len, 1} x {KV_seq_len, 1}); ", + "4d - ({Batch, 1} x {Num_heads, 1} x {Q_seq_len, 1} x {KV_seq_len, 1})"); + } + return false; +} + +inline bool check_tensor_shapes(sdp_params const& params, bool debug) { + auto query_dim = params.query.dim(); + if (!(query_dim == params.key.dim() && query_dim == params.value.dim() && + (query_dim == 4))) { + if (debug) { + TORCH_WARN( + "All fused kernels requires query, key and value to be 4 dimensional, but got Query dim: ", + query_dim, + ", Key dim: ", + params.key.dim(), + ", Value dim: ", + params.value.dim(), + " instead."); + } + return false; + } + return true; +} + +inline bool check_safe_kv_broadcast(at::Tensor const& param, bool debug) { + const auto nt_tensor_impl = at::native::get_nested_tensor_impl(param); + auto seq_len = nt_tensor_impl->opt_size(2); + if (!seq_len.has_value()) { + if (debug) { + TORCH_WARN( + "For both fused kernels, if one of key/value batch_size requires " + "broadcasting and the other does not, then the other must have a ", + "consistent seq_len dim.") + } + return false; + } + return true; +} + +template +inline bool check_grouped_query_attention(sdp_params const& params, bool debug) { + const auto q_num_heads = params.query.sym_size(-3); + const auto k_num_heads = params.key.sym_size(-3); + const auto v_num_heads = params.value.sym_size(-3); + const bool same_kv_heads = k_num_heads == v_num_heads; + + if (requires_same_num_heads && !same_kv_heads){ + if (debug) { + TORCH_WARN( + "Both fused kernels require key and value to have the same num_heads and batch_size but got: ", + "Key sizes: ", + params.key.sizes(), + ", Value sizes: ", + params.value.sizes(), + ", Query sizes: ", + params.query.sizes(), + " instead."); + } + return false; + } + // Check if grouped query attention is supported and validate the number of + // heads + if (q_num_heads % k_num_heads != 0 || (!requires_same_num_heads && (q_num_heads % v_num_heads != 0))) { + if (debug) { + TORCH_WARN( + "The number of heads in key/value must divide number of heads in query.", + "Got input Key sizes(): ", + params.key.sym_size(-3), + ", Value sizes(): ", + params.value.sym_size(-3), + ", Query sizes(): ", + params.query.sym_size(-3), + " instead."); + } + return false; + } + return true; +} + +template +inline bool check_batch_size_and_num_heads_dense(sdp_params const& params, bool debug) { + // This is expected to be called after check_tensor_shapes ensuring that the + // size() calls won't error since the inputs are all 4 dimensional + + auto q_batch_size = params.query.sym_size(0); + auto k_batch_size = params.key.sym_size(0); + auto v_batch_size = params.value.sym_size(0); + + bool same_batch_size = + q_batch_size == k_batch_size && q_batch_size == v_batch_size; + + auto q_num_heads = params.query.sym_size(-3); + auto k_num_heads = params.key.sym_size(-3); + auto v_num_heads = params.value.sym_size(-3); + + bool same_num_heads = + q_num_heads == k_num_heads && q_num_heads == v_num_heads; + + if (!same_batch_size){ + if(debug) { + TORCH_WARN( + "For dense inputs, both fused kernels require query, key and value to have the same batch_size. ", + "Query.sizes(): ", + params.query.sizes(), + ", Key.sizes(): ", + params.key.sizes(), + ", Value.sizes(): ", + params.value.sizes(), + " instead. To broadcast dense inputs, try using unsqueeze and expand_to before passing them into the kernel."); + } + return false; + } + + if(params.enable_gqa && supports_gqa){ + return check_grouped_query_attention(params, debug); + } + + // same num heads condition for non-gqa case + if (!same_num_heads){ + if (debug) { + TORCH_WARN( + "For dense input, both fused kernels require query, key and value to have the same num_heads. ", + "Query.sizes(): ", + params.query.sizes(), + ", Key sizes(): ", + params.key.sizes(), + ", Value sizes(): ", + params.value.sizes(), + " instead. To broadcast dense inputs, try using unsqueeze and expand_to before passing them into the kernel."); + } + return false; + } + // If all checks pass, return true + return true; +} + +inline bool check_batch_size_nested(sdp_params const& params, bool debug) { + // This is expected to be called after check_tensor_shapes ensuring that the + // size() calls won't error since the inputs are all 4 dimensional + auto q_batch_size = params.query.sym_size(0); + auto k_batch_size = params.key.sym_size(0); + auto v_batch_size = params.value.sym_size(0); + + bool same_batch_size = + q_batch_size == k_batch_size && q_batch_size == v_batch_size; + + // num_heads logic for nested input is checked in + // check_for_seq_len_0_nested_tensor as there is handling there to make sure + // num_heads is not ragged + bool broadcastable_batch_size = true; + if (!same_batch_size) { + if (input_requires_grad(params)){ + if (debug) { + TORCH_WARN( + "Both fused kernels do not support training with broadcasted NT inputs."); + } + return false; + } + // try to broadcast batchsize + broadcastable_batch_size = try_broadcast_param_size( + q_batch_size, k_batch_size, v_batch_size, "batch size ", debug); + + // if only one of k or v require broadcasting of batch size, the other + // must have a consistent seq_len dim + if (broadcastable_batch_size) { + if (k_batch_size == 1 && v_batch_size != 1 && + !check_safe_kv_broadcast(params.value, debug)) { + return false; + } + if (v_batch_size == 1 && k_batch_size != 1 && + !check_safe_kv_broadcast(params.key, debug)) { + return false; + } + } + } + return broadcastable_batch_size; +} + +inline bool check_nonzero_sequence_lengths_dense(sdp_params const& params, bool debug) { + // In some cases people will pass in 0 sized tensors, this will + // cause the fused path to error with unaligned mask + bool zero_seq_len_q = params.query.sym_size(-2) == 0; + bool zero_seq_len_k = params.key.sym_size(-2) == 0; + if (zero_seq_len_q || zero_seq_len_k) { + if (debug) { + TORCH_WARN( + "All fused kernels do not support zero seq_len_q or seq_len_kv."); + } + return false; + } + return true; +} + +template +inline bool check_last_dim_stride_equals_1_dense(sdp_params const& params, bool debug) { + // The stride checking for NestedTensors is done within the kernel + // And .contiguous will be called if needed + + // This function checks that the last dimension of the inputs to + // fused_attention have stride 1 + bool qkv_strides_equal_1 = params.query.sym_stride(-1) == 1 && + params.key.sym_stride(-1) == 1 && params.value.sym_stride(-1) == 1; + + // https://github.com/pytorch/pytorch/issues/116333 + // If the head_dim is size 1 the stride won't matter, but we + // check this condition before padding the head_dim to 1 + if (ignore_singleton_dim){ + qkv_strides_equal_1 = qkv_strides_equal_1 || params.query.sym_size(-1) == 1; + } + bool is_cpu = params.query.device().type() == c10::DeviceType::CPU; + bool mask_stride_equal_1 = params.attn_mask.has_value() + ? params.attn_mask.value().sym_stride(-1) == 1 + : true; + bool mask_stride_valid = is_cpu ? true : mask_stride_equal_1; + if (!(qkv_strides_equal_1 && mask_stride_valid)) { + if (debug) { + std::ostringstream message; + message + << "All fused kernels require the last dimension of the input to have stride 1. "; + message << "Got Query.stride(-1): " << params.query.sym_stride(-1) + << ", Key.stride(-1): " << params.key.sym_stride(-1) + << ", Value.stride(-1): " << params.value.sym_stride(-1); + + if (params.attn_mask.has_value()) { + message + << ", Attn_mask.stride(-1): " + << params.attn_mask.value().sym_stride(-1) + << " (GPU backends require attn_mask's last dimension to have stride 1 while the CPU does not)."; + } + TORCH_WARN(message.str()); + } + + return false; + } + return true; +} + +inline bool check_runtime_disabled_flash(sdp_params const& params, bool debug) { + // We check the global context to see if user has explicitly turned of flash + // sdp kernels + if (!at::globalContext().userEnabledFlashSDP()) { + if (debug) { + TORCH_WARN("Flash attention has been runtime disabled."); + } + return false; + } + return true; +} + +inline bool check_runtime_disabled_mem_efficient(sdp_params const& params, bool debug) { + // We check the global context to see if user has explicitly turned of + // mem_efficient sdp kernels + if (!at::globalContext().userEnabledMemEfficientSDP()) { + if (debug) { + TORCH_WARN("Memory Efficient attention has been runtime disabled."); + } + return false; + } + return true; +} + + +} // namespace sdp + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/xpu/sdp_utils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/xpu/sdp_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b2ddf6d52bc58be0bb901a0dee0375cefc3224a5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/transformers/xpu/sdp_utils.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace sdp { + +C10_EXPORT bool is_flash_attention_available(); +C10_EXPORT bool can_use_flash_attention(sdp_params const& params, bool debug); +C10_EXPORT bool check_flash_attention_hardware_support( + sdp_params const& params, + bool debug); + +} // namespace sdp + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/utils/Factory.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/utils/Factory.h new file mode 100644 index 0000000000000000000000000000000000000000..f338911e70c0d2356d58f340a937d16798a65fec --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/utils/Factory.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native::mobile { + +Tensor allocate_padded_contiguous_if_needed( + const Tensor& input, + c10::MemoryFormat memory_format); + +// TODO: Remove this function when at::native::empty() is modified to accept a +// custom memory allocator. + +at::Tensor empty_with_tail_padding( + IntArrayRef size, + const caffe2::TypeMeta dtype, + c10::MemoryFormat memory_format, + std::optional maybe_names); + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..2d8c66b5e99d8b40b69363ecfa6324247a47454c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamUtils.h @@ -0,0 +1,47 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at { +namespace native { + +template +inline std::vector _expand_param_if_needed( + ArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + if (list_param.size() == 1) { + return std::vector(expected_dim, list_param[0]); + } else if ((int64_t)list_param.size() != expected_dim) { + std::ostringstream ss; + ss << "expected " << param_name << " to be a single integer value or a " + << "list of " << expected_dim << " values to match the convolution " + << "dimensions, but got " << param_name << '=' << list_param; + TORCH_CHECK(false, ss.str()); + } else { + return list_param.vec(); + } +} + +inline std::vector expand_param_if_needed( + IntArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + return _expand_param_if_needed(list_param, param_name, expected_dim); +} + +inline std::vector expand_param_if_needed( + SymIntArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + return _expand_param_if_needed(list_param, param_name, expected_dim); +} + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamsHash.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamsHash.h new file mode 100644 index 0000000000000000000000000000000000000000..46c05e706c9a9d230e6bafadb4d3caa82241c43c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamsHash.h @@ -0,0 +1,109 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native { + +// Hashing machinery for Params +// Fowler–Noll–Vo hash function +// see +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +template +struct ParamsHash { + // Params must be a POD because we read out its memory + // contents as char* when hashing + static_assert(std::is_standard_layout_v, "Params is not POD"); + + size_t operator()(const Params& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (const auto i : c10::irange(sizeof(Params))) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +template +struct ParamsEqual { + // Params must be a POD because we read out its memory + // contents as char* when comparing + static_assert(std::is_standard_layout_v, "Params is not POD"); + + bool operator()(const Params& a, const Params& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(Params)) == 0; + } +}; + +// Provide explicit byte-for-byte constructors to avoid uwittingly leaving +// padding bytes uninitialized (e.g., when passing Params by value) +template +struct ParamsWrapper { + T pod; + static_assert( + std::is_standard_layout_v, + "ParamsWrapper cannot wrap non-POD data"); + + ParamsWrapper() { + memset(&(this->pod), 0, sizeof(this->pod)); + } + + ParamsWrapper(const ParamsWrapper& other) { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + } + + ParamsWrapper(ParamsWrapper&& other) noexcept { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + } + + ParamsWrapper& operator=(const ParamsWrapper& other) { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + return *this; + } + + ParamsWrapper& operator=(ParamsWrapper&& other) noexcept { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + return *this; + } + + inline friend bool operator==( + const ParamsWrapper& lhs, + const ParamsWrapper& rhs) noexcept { + auto ptr1 = reinterpret_cast(&(lhs.pod)); + auto ptr2 = reinterpret_cast(&(rhs.pod)); + return memcmp(ptr1, ptr2, sizeof(lhs.pod)) == 0; + } +}; + +// Wrapped version: this allows the outer struct to have custom copy and move +// constructors for additional safety +template +struct ParamsWrapperHash { + // Params must be a POD because we read out its memory + // contents as char* when hashing + static_assert( + std::is_standard_layout_v, + "ParamsWrapper cannot wrap non-POD data"); + + size_t operator()(const ParamsWrapper& params_wrapper) const { + auto ptr = reinterpret_cast(&(params_wrapper.pod)); + uint32_t value = 0x811C9DC5; + for (const auto i : c10::irange(sizeof(params_wrapper.pod))) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/verbose_wrapper.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/verbose_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..bc144b9f92cc6441074485888f34cb4c44a683de --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/verbose_wrapper.h @@ -0,0 +1,13 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace torch::verbose { +TORCH_API int _mkl_set_verbose(int enable); +TORCH_API int _mkldnn_set_verbose(int level); +} // namespace torch::verbose + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/vol2col.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/vol2col.h new file mode 100644 index 0000000000000000000000000000000000000000..454e468ab35edf7fed2d0b28d898a8add76b8164 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/native/vol2col.h @@ -0,0 +1,114 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +template +void vol2col( + const T* data_vol, + const int64_t channels, + const int64_t depth, + const int64_t height, + const int64_t width, + const int64_t depth_col, + const int64_t height_col, + const int64_t width_col, + const int64_t kT, + const int64_t kernel_height, + const int64_t kernel_width, + const int64_t pT, + const int64_t pH, + const int64_t pW, + const int64_t dT, + const int64_t dH, + const int64_t dW, + const int64_t dilationT, + const int64_t dilationH, + const int64_t dilationW, + T* data_col) { + int64_t c, t, h, w; + int64_t channels_col = channels * kT * kernel_height * kernel_width; + for (c = 0; c < channels_col; ++c) { + int64_t w_offset = c % kernel_width; + int64_t h_offset = (c / kernel_width) % kernel_height; + int64_t t_offset = (c / kernel_width / kernel_height) % kT; + int64_t c_vol = c / kT / kernel_height / kernel_width; + for (t = 0; t < depth_col; ++t) { + int64_t t_pad = t * dT - pT + t_offset * dilationT; + for (h = 0; h < height_col; ++h) { + int64_t h_pad = h * dH - pH + h_offset * dilationH; + for (w = 0; w < width_col; ++w) { + int64_t w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = + data_vol + [((c_vol * depth + t_pad) * height + h_pad) * width + + w_pad]; + else + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = + 0; + } + } + } + } +} + +template +void col2vol( + const T* data_col, + const int64_t channels, + const int64_t depth, + const int64_t height, + const int64_t width, + const int64_t out_depth, + const int64_t out_height, + const int64_t out_width, + const int64_t kT, + const int64_t kernel_height, + const int64_t kernel_width, + const int64_t pT, + const int64_t pH, + const int64_t pW, + const int64_t dT, + const int64_t dH, + const int64_t dW, + const int64_t dilationT, + const int64_t dilationH, + const int64_t dilationW, + T* data_vol) { + memset(data_vol, 0, sizeof(T) * depth * height * width * channels); + int64_t depth_col = out_depth; + int64_t height_col = out_height; + int64_t width_col = out_width; + int64_t channels_col = channels * kT * kernel_height * kernel_width; + for (int64_t c = 0; c < channels_col; ++c) { + int64_t w_offset = c % kernel_width; + int64_t h_offset = (c / kernel_width) % kernel_height; + int64_t t_offset = (c / kernel_width / kernel_height) % kT; + int64_t c_vol = c / kT / kernel_height / kernel_width; + for (int64_t t = 0; t < depth_col; ++t) { + int64_t t_pad = t * dT - pT + t_offset * dilationT; + for (int64_t h = 0; h < height_col; ++h) { + int64_t h_pad = h * dH - pH + h_offset * dilationH; + for (int64_t w = 0; w < width_col; ++w) { + int64_t w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_vol + [((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] += + data_col + [((c * depth_col + t) * height_col + h) * width_col + w]; + } + } + } + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..d68fcec634a241df67faa94606c635d300922a98 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor +inline at::Tensor _adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d::call(self, c10::fromIntArrayRefSlow(output_size)); +} +namespace symint { + template >> + at::Tensor _adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d::call(self, c10::fromIntArrayRefSlow(output_size)); + } +} + +// aten::_adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor +inline at::Tensor _adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d::call(self, output_size); +} +namespace symint { + template >> + at::Tensor _adaptive_avg_pool2d(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d::call(self, output_size); + } +} + +// aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template >> + at::Tensor & _adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template >> + at::Tensor & _adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, output_size, out); +} +namespace symint { + template >> + at::Tensor & _adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, output_size, out); + } +} + +// aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, output_size, out); +} +namespace symint { + template >> + at::Tensor & _adaptive_avg_pool2d_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, output_size, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_backward_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_backward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a20c52c189212f6de6cd76eb5dd8d4f850e5a38a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_backward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _adaptive_avg_pool2d_backward_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self); +TORCH_API at::Tensor & _adaptive_avg_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d901f635b8d97afd4b4f66e6571642a5cb2b7109 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor _adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool3d_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool3d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ca8c51742a592ecba293b870cae795c8b41b4ea4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool3d_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _adaptive_avg_pool3d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor _adaptive_avg_pool3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..50d7cce05d2a82eefae74722676792a760103dc7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured__addmm_activation : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, bool use_gelu); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_native.h new file mode 100644 index 0000000000000000000000000000000000000000..42118c217630a793e3dc99db81ced73f4cec6f65 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_addmm_activation_out_cpu : public at::meta::structured__addmm_activation { +void impl(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, bool use_gelu, const at::Tensor & out); +}; +struct TORCH_API structured_addmm_activation_out_cuda : public at::meta::structured__addmm_activation { +void impl(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, bool use_gelu, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..d99cfde46f1f4850e4a4815ac71c18cc06762be9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!) +inline at::Tensor & _amp_update_scale_(at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) { + return at::_ops::_amp_update_scale_::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval); +} + +// aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _amp_update_scale_out(at::Tensor & out, const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) { + return at::_ops::_amp_update_scale_out::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out); +} +// aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _amp_update_scale_outf(const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, at::Tensor & out) { + return at::_ops::_amp_update_scale_out::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out); +} + +// aten::_amp_update_scale(Tensor self, Tensor growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> (Tensor, Tensor growth_tracker_out) +inline ::std::tuple _amp_update_scale(const at::Tensor & self, const at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) { + return at::_ops::_amp_update_scale::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1f4121095527699fc19e2ad629c3e0d1a4e4bcfd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_update_scale_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _amp_update_scale(const at::Tensor & self, const at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval); +TORCH_API at::Tensor & _amp_update_scale_out(const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, at::Tensor & out); +TORCH_API at::Tensor & _amp_update_scale_cpu_(at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval); +TORCH_API at::Tensor & _amp_update_scale_cuda_(at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_assert_async_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_assert_async_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..226e4a3efcf0b73534e03921bb143046692963ed --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_assert_async_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API void _assert_async(const at::Tensor & self); +TORCH_API void _assert_async(const at::Tensor & self, c10::string_view assert_msg); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_full_precision.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_full_precision.h new file mode 100644 index 0000000000000000000000000000000000000000..0cebc6cce88b004f8d9428fcbb1287974ba3f098 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_full_precision.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_full_precision_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_full_precision_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f505b7df76a96d7fa7dd1c9b81ee840e019fb91c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_full_precision_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _autocast_to_full_precision { + using schema = at::Tensor (const at::Tensor &, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_autocast_to_full_precision"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_autocast_to_full_precision(Tensor(a) self, bool cuda_enabled, bool cpu_enabled) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, bool cuda_enabled, bool cpu_enabled); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool cuda_enabled, bool cpu_enabled); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision.h new file mode 100644 index 0000000000000000000000000000000000000000..49cf9563e00deab38c11abfb68d9b81a5c9fa885 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2582855d91d96152c2428f0e56d519ed5bbe0b08 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _autocast_to_reduced_precision(const at::Tensor & self, bool cuda_enabled, bool cpu_enabled, at::ScalarType cuda_dtype, at::ScalarType cpu_dtype); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f41b6017fab3eb8277c60bfbe4a4e4f1309a28e3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_backward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _backward { + using schema = void (const at::Tensor &, at::TensorList, const ::std::optional &, ::std::optional, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()"; + static void call(const at::Tensor & self, at::TensorList inputs, const ::std::optional & gradient, ::std::optional retain_graph, bool create_graph); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorList inputs, const ::std::optional & gradient, ::std::optional retain_graph, bool create_graph); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_impl_index_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_impl_index_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..179e583e27f877b69011dbaea0a36604081a8328 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_impl_index_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API ::std::tuple _batch_norm_impl_index(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, bool training, double momentum, double eps, bool cudnn_enabled); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6e6a35527c9586110e424afbd11fc95e83c3a155 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple _batch_norm_with_update_functional(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & running_mean, const at::Tensor & running_var, double momentum, double eps); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..29338eceb392f0b212d418120deefb9931fc9f2f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _batch_norm_with_update { + using schema = ::std::tuple (const at::Tensor &, const ::std::optional &, const ::std::optional &, at::Tensor &, at::Tensor &, double, double); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_batch_norm_with_update"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps); +}; + +struct TORCH_API _batch_norm_with_update_out { + using schema = ::std::tuple (const at::Tensor &, const ::std::optional &, const ::std::optional &, at::Tensor &, at::Tensor &, double, double, at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_batch_norm_with_update"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!))"; + static ::std::tuple call(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, at::Tensor & reserve); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, at::Tensor & reserve); +}; + +struct TORCH_API _batch_norm_with_update_functional { + using schema = ::std::tuple (const at::Tensor &, const ::std::optional &, const ::std::optional &, const at::Tensor &, const at::Tensor &, double, double); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_batch_norm_with_update_functional"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_batch_norm_with_update_functional(Tensor input, Tensor? weight, Tensor? bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor, Tensor running_mean_out, Tensor running_var_out)"; + static ::std::tuple call(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & running_mean, const at::Tensor & running_var, double momentum, double eps); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & running_mean, const at::Tensor & running_var, double momentum, double eps); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Byte_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Byte_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a83af4d780df86d4a6f8b69184170eb6752c63c6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Byte_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor _cast_Byte(const at::Tensor & self, bool non_blocking=false); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half.h new file mode 100644 index 0000000000000000000000000000000000000000..53eec94f51c84761eebeeeeeb0e7f624bbb07fb1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_cast_Half(Tensor self, bool non_blocking=False) -> Tensor +inline at::Tensor _cast_Half(const at::Tensor & self, bool non_blocking=false) { + return at::_ops::_cast_Half::call(self, non_blocking); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..dc20a697e6fb4fbb749eaca4c04de45050f5cb5a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor _cast_Half(const at::Tensor & self, bool non_blocking=false); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cdist_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cdist_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f9b80465aabf84f59fe9dd9074cfe1c0eaa1011f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cdist_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _cdist_backward_out(const at::Tensor & grad, const at::Tensor & x1, const at::Tensor & x2, double p, const at::Tensor & cdist, at::Tensor & out); +TORCH_API at::Tensor _cdist_backward(const at::Tensor & grad, const at::Tensor & x1, const at::Tensor & x2, double p, const at::Tensor & cdist); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_coalesce.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_coalesce.h new file mode 100644 index 0000000000000000000000000000000000000000..91c3009bf8a8d507bc56d71a39ebff0c033c634f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_coalesce.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_coalesce(Tensor self) -> Tensor +inline at::Tensor _coalesce(const at::Tensor & self) { + return at::_ops::_coalesce::call(self); +} + +// aten::_coalesce.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _coalesce_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::_coalesce_out::call(self, out); +} +// aten::_coalesce.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _coalesce_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::_coalesce_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_compute_linear_combination.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_compute_linear_combination.h new file mode 100644 index 0000000000000000000000000000000000000000..75598c326840e418fa10703634d8a184f92b910f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_compute_linear_combination.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor +inline at::Tensor _compute_linear_combination(const at::Tensor & input, const at::Tensor & coefficients) { + return at::_ops::_compute_linear_combination::call(input, coefficients); +} + +// aten::_compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _compute_linear_combination_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & coefficients) { + return at::_ops::_compute_linear_combination_out::call(input, coefficients, out); +} +// aten::_compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _compute_linear_combination_outf(const at::Tensor & input, const at::Tensor & coefficients, at::Tensor & out) { + return at::_ops::_compute_linear_combination_out::call(input, coefficients, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2741d5ba85db43917f2f809bbd2e99a64647f0e0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _conj { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_conj"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_conj(Tensor(a) self) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_physical_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_physical_native.h new file mode 100644 index 0000000000000000000000000000000000000000..62ccfad6ca11267796894962b8086a1a605ecf98 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_physical_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _conj_physical(const at::Tensor & self); +TORCH_API at::Tensor & _conj_physical_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor conj_physical_sparse_csr(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_indices_from_coo_to_csr_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_indices_from_coo_to_csr_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..03984750d67e6107a376d38216fd3936a3d79192 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_indices_from_coo_to_csr_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor _convert_indices_from_coo_to_csr(const at::Tensor & self, int64_t size, bool out_int32=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution.h new file mode 100644 index 0000000000000000000000000000000000000000..00139d482f9f12248dc55b91947a498dc8025d3d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution.h @@ -0,0 +1,119 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +inline at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32); +} +namespace symint { + template >> + at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32); + } +} + +// aten::_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +inline at::Tensor _convolution_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32); +} +namespace symint { + template >> + at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32); + } +} + +// aten::_convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor +inline at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return at::_ops::_convolution_deprecated::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled); +} +namespace symint { + template >> + at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return at::_ops::_convolution_deprecated::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled); + } +} + +// aten::_convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor +inline at::Tensor _convolution_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return at::_ops::_convolution_deprecated::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled); +} +namespace symint { + template >> + at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return at::_ops::_convolution_deprecated::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled); + } +} + +// aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _convolution_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution_out::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); +} +namespace symint { + template >> + at::Tensor & _convolution_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution_out::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); + } +} + +// aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _convolution_outf(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, at::Tensor & out) { + return at::_ops::_convolution_out::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); +} +namespace symint { + template >> + at::Tensor & _convolution_outf(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, at::Tensor & out) { + return at::_ops::_convolution_out::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); + } +} + +// aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _convolution_symint_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution_out::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); +} +namespace symint { + template >> + at::Tensor & _convolution_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution_out::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); + } +} + +// aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _convolution_symint_outf(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, at::Tensor & out) { + return at::_ops::_convolution_out::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); +} +namespace symint { + template >> + at::Tensor & _convolution_outf(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, at::Tensor & out) { + return at::_ops::_convolution_out::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_double_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_double_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..454c0b2d12778ad7b7337d189653cb44b9327895 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_double_backward.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +inline ::std::tuple _convolution_double_backward(const ::std::optional & ggI, const ::std::optional & ggW, const ::std::optional & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array output_mask) { + return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask); +} +namespace symint { + template >> + ::std::tuple _convolution_double_backward(const ::std::optional & ggI, const ::std::optional & ggW, const ::std::optional & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array output_mask) { + return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask); + } +} + +// aten::_convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +inline ::std::tuple _convolution_double_backward_symint(const ::std::optional & ggI, const ::std::optional & ggW, const ::std::optional & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array output_mask) { + return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, stride, padding, dilation, transposed, output_padding, groups, output_mask); +} +namespace symint { + template >> + ::std::tuple _convolution_double_backward(const ::std::optional & ggI, const ::std::optional & ggW, const ::std::optional & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array output_mask) { + return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, stride, padding, dilation, transposed, output_padding, groups, output_mask); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8d567662e311c16216640cdaf57b1c4eb1abd8d7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _copy_from { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_copy_from"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & dst, bool non_blocking); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & dst, bool non_blocking); +}; + +struct TORCH_API _copy_from_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_copy_from"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_copy_from.out(Tensor self, Tensor dst, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & dst, bool non_blocking, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & dst, bool non_blocking, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_sparse_mm_search_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_sparse_mm_search_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fc44f65860f09d2861448b9929070abeb8bcae96 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_sparse_mm_search_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _cslt_sparse_mm_search { + using schema = int64_t (const at::Tensor &, const at::Tensor &, const ::std::optional &, const ::std::optional &, ::std::optional, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cslt_sparse_mm_search"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int"; + static int64_t call(const at::Tensor & compressed_A, const at::Tensor & dense_B, const ::std::optional & bias, const ::std::optional & alpha, ::std::optional out_dtype, bool transpose_result); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_A, const at::Tensor & dense_B, const ::std::optional & bias, const ::std::optional & alpha, ::std::optional out_dtype, bool transpose_result); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss.h new file mode 100644 index 0000000000000000000000000000000000000000..ee9ca2bae229a4baf6a0529401fd079bfd5db686 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) +inline ::std::tuple _cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity) { + return at::_ops::_cudnn_ctc_loss::call(log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity); +} + +// aten::_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) +inline ::std::tuple _cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool deterministic, bool zero_infinity) { + return at::_ops::_cudnn_ctc_loss_Tensor::call(log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity); +} + +// aten::_cudnn_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _cudnn_ctc_loss_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity) { + return at::_ops::_cudnn_ctc_loss_out::call(log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity, out0, out1); +} +// aten::_cudnn_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _cudnn_ctc_loss_outf(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity, at::Tensor & out0, at::Tensor & out1) { + return at::_ops::_cudnn_ctc_loss_out::call(log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity, out0, out1); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..80219580c7f35528cb79e40c2eb6492d98ae9ba2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _cudnn_ctc_loss { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_ctc_loss"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity); +}; + +struct TORCH_API _cudnn_ctc_loss_Tensor { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_ctc_loss"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool deterministic, bool zero_infinity); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool deterministic, bool zero_infinity); +}; + +struct TORCH_API _cudnn_ctc_loss_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_ctc_loss"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_cudnn_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..bf36a0b5a2f2814e08362e3f829438df8fd4bb86 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _cudnn_init_dropout_state { + using schema = at::Tensor (double, bool, int64_t, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_init_dropout_state"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor"; + static at::Tensor call(double dropout, bool train, int64_t dropout_seed, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, double dropout, bool train, int64_t dropout_seed, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API _cudnn_init_dropout_state_out { + using schema = at::Tensor & (double, bool, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_init_dropout_state"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_cudnn_init_dropout_state.out(float dropout, bool train, int dropout_seed, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(double dropout, bool train, int64_t dropout_seed, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, double dropout, bool train, int64_t dropout_seed, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_rnn_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_rnn_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..45f442955097aee3bff265b9cfa4bd2d27157a24 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_rnn_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _cudnn_rnn { + using schema = ::std::tuple (const at::Tensor &, at::TensorList, int64_t, const ::std::optional &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt, c10::SymInt, int64_t, bool, double, bool, bool, c10::SymIntArrayRef, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_rnn"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional & weight_buf, const at::Tensor & hx, const ::std::optional & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const ::std::optional & dropout_state); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional & weight_buf, const at::Tensor & hx, const ::std::optional & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const ::std::optional & dropout_state); +}; + +struct TORCH_API _cudnn_rnn_out { + using schema = ::std::tuple (const at::Tensor &, at::TensorList, int64_t, const ::std::optional &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt, c10::SymInt, int64_t, bool, double, bool, bool, c10::SymIntArrayRef, const ::std::optional &, at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_rnn"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_cudnn_rnn.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))"; + static ::std::tuple call(const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional & weight_buf, const at::Tensor & hx, const ::std::optional & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const ::std::optional & dropout_state, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional & weight_buf, const at::Tensor & hx, const ::std::optional & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const ::std::optional & dropout_state, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5c438b64ce7aca7a6a231b50468e1cea1b8d8150 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API int64_t _debug_has_internal_overlap(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..984a0d5072a750908230258c7b29e0af0b65d68e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _debug_has_internal_overlap { + using schema = int64_t (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_debug_has_internal_overlap"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_debug_has_internal_overlap(Tensor self) -> int"; + static int64_t call(const at::Tensor & self); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_efficient_attention_backward_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_efficient_attention_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ff23210d320d2cfebecf9d9f785e402be9b818f7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_efficient_attention_backward_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple _efficient_attention_backward(const at::Tensor & grad_out_, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional & bias, const at::Tensor & out, const ::std::optional & cu_seqlens_q, const ::std::optional & cu_seqlens_k, int64_t max_seqlen_q, int64_t max_seqlen_k, const at::Tensor & logsumexp, double dropout_p, const at::Tensor & philox_seed, const at::Tensor & philox_offset, int64_t custom_mask_type, bool bias_requires_grad, ::std::optional scale=::std::nullopt, ::std::optional num_splits_key=::std::nullopt, ::std::optional window_size=::std::nullopt, bool shared_storage_dqdkdv=false); +TORCH_API ::std::tuple _efficient_attention_backward_symint(const at::Tensor & grad_out_, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional & bias, const at::Tensor & out, const ::std::optional & cu_seqlens_q, const ::std::optional & cu_seqlens_k, c10::SymInt max_seqlen_q, c10::SymInt max_seqlen_k, const at::Tensor & logsumexp, double dropout_p, const at::Tensor & philox_seed, const at::Tensor & philox_offset, int64_t custom_mask_type, bool bias_requires_grad, ::std::optional scale=::std::nullopt, ::std::optional num_splits_key=::std::nullopt, ::std::optional window_size=::std::nullopt, bool shared_storage_dqdkdv=false); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..7c2541f982101aad537118295f41ac7d34a57a06 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor +inline at::Tensor _embedding_bag_dense_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx); +} +namespace symint { + template >> + at::Tensor _embedding_bag_dense_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx); + } +} + +// aten::_embedding_bag_dense_backward(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor +inline at::Tensor _embedding_bag_dense_backward_symint(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx); +} +namespace symint { + template >> + at::Tensor _embedding_bag_dense_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx); + } +} + +// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _embedding_bag_dense_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); +} +namespace symint { + template >> + at::Tensor & _embedding_bag_dense_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); + } +} + +// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _embedding_bag_dense_backward_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx, at::Tensor & out) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); +} +namespace symint { + template >> + at::Tensor & _embedding_bag_dense_backward_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx, at::Tensor & out) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); + } +} + +// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _embedding_bag_dense_backward_symint_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); +} +namespace symint { + template >> + at::Tensor & _embedding_bag_dense_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); + } +} + +// aten::_embedding_bag_dense_backward.out(Tensor grad, Tensor indices, Tensor offset2bag, Tensor bag_size, Tensor maximum_indices, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _embedding_bag_dense_backward_symint_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx, at::Tensor & out) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); +} +namespace symint { + template >> + at::Tensor & _embedding_bag_dense_backward_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx, at::Tensor & out) { + return at::_ops::_embedding_bag_dense_backward_out::call(grad, indices, offset2bag, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_forward_only_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_forward_only_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d74307ee632e81fcbe5efb98546dc71bac2d4469 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_forward_only_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple _embedding_bag_forward_only_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const ::std::optional & per_sample_weights={}, bool include_last_offset=false, int64_t padding_idx=-1); +TORCH_API ::std::tuple _embedding_bag_forward_only_outf(const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const ::std::optional & per_sample_weights, bool include_last_offset, int64_t padding_idx, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f4265bc61c1451282f2ebbeb62941afe3ed42cae --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _embedding_bag_out(const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const ::std::optional & per_sample_weights, bool include_last_offset, int64_t padding_idx, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3); +TORCH_API ::std::tuple _embedding_bag_cpu(const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const ::std::optional & per_sample_weights={}, bool include_last_offset=false, int64_t padding_idx=-1); +TORCH_API ::std::tuple _embedding_bag_cuda(const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const ::std::optional & per_sample_weights={}, bool include_last_offset=false, int64_t padding_idx=-1); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..40c48c22253136e3e88e5f85ee3814c25321e936 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor +inline at::Tensor _embedding_bag_per_sample_weights_backward(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_per_sample_weights_backward::call(grad, weight, indices, offsets, offset2bag, mode, padding_idx); +} + +// aten::_embedding_bag_per_sample_weights_backward.out(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _embedding_bag_per_sample_weights_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_per_sample_weights_backward_out::call(grad, weight, indices, offsets, offset2bag, mode, padding_idx, out); +} +// aten::_embedding_bag_per_sample_weights_backward.out(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _embedding_bag_per_sample_weights_backward_outf(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx, at::Tensor & out) { + return at::_ops::_embedding_bag_per_sample_weights_backward_out::call(grad, weight, indices, offsets, offset2bag, mode, padding_idx, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b24ce717374bcf9d99ab82248d6d9f47abe81796 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _embedding_bag_per_sample_weights_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1); +TORCH_API at::Tensor & _embedding_bag_per_sample_weights_backward_outf(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..dd78590e835d0548ad9df3030e827c5a6428dfdb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _embedding_bag_per_sample_weights_backward(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_affine_quantized_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_affine_quantized_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..a68afff8bd1c495d4e3f9ae82ad62ad4ab40d2b1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_affine_quantized_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _empty_affine_quantized { + using schema = at::Tensor (c10::SymIntArrayRef, ::std::optional, ::std::optional, ::std::optional, ::std::optional, double, int64_t, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_empty_affine_quantized"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format) -> Tensor"; + static at::Tensor call(c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, double scale, int64_t zero_point, ::std::optional memory_format); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, double scale, int64_t zero_point, ::std::optional memory_format); +}; + +struct TORCH_API _empty_affine_quantized_out { + using schema = at::Tensor & (c10::SymIntArrayRef, double, int64_t, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_empty_affine_quantized"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_empty_affine_quantized.out(SymInt[] size, *, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(c10::SymIntArrayRef size, double scale, int64_t zero_point, ::std::optional memory_format, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, double scale, int64_t zero_point, ::std::optional memory_format, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..038327f183f308a3fc1781e5006d802c2c1966c7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _empty_per_channel_affine_quantized_out(at::Tensor & out, at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format=c10::MemoryFormat::Contiguous); +TORCH_API at::Tensor & _empty_per_channel_affine_quantized_outf(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format, at::Tensor & out); +TORCH_API at::Tensor & _empty_per_channel_affine_quantized_symint_out(at::Tensor & out, c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format=c10::MemoryFormat::Contiguous); +TORCH_API at::Tensor & _empty_per_channel_affine_quantized_symint_outf(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine_backward_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c6a523d2ac919c741f14c082beb6b5423e13c17f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine_backward_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple _fake_quantize_learnable_per_channel_affine_backward(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor=1.0); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..81f7ff920c55ea5edd6edb2989515be6a8b9bf57 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _fake_quantize_learnable_per_channel_affine_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor=1.0); +TORCH_API at::Tensor & _fake_quantize_learnable_per_channel_affine_outf(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..ec64714d6b487a1d82ef7156a029968e7e990868 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor) +inline ::std::tuple _fake_quantize_learnable_per_tensor_affine_backward(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor=1.0) { + return at::_ops::_fake_quantize_learnable_per_tensor_affine_backward::call(grad, self, scale, zero_point, quant_min, quant_max, grad_factor); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2824af81e46863d4d3529011f5ed20ab9bc8a603 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, const at::Tensor & fake_quant_enabled, int64_t quant_min, int64_t quant_max); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fft_r2c.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fft_r2c.h new file mode 100644 index 0000000000000000000000000000000000000000..7e98b167bc79b7932343df69c19388fed8fe5511 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fft_r2c.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor +inline at::Tensor _fft_r2c(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided) { + return at::_ops::_fft_r2c::call(self, dim, normalization, onesided); +} + +// aten::_fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _fft_r2c_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided) { + return at::_ops::_fft_r2c_out::call(self, dim, normalization, onesided, out); +} +// aten::_fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _fft_r2c_outf(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, bool onesided, at::Tensor & out) { + return at::_ops::_fft_r2c_out::call(self, dim, normalization, onesided, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..3f302ef4d3f695a88175a4a0c4f3c29fbba5d496 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor rng_state, Tensor unused, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor) +inline ::std::tuple _flash_attention_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional scale=::std::nullopt, ::std::optional window_size_left=::std::nullopt, ::std::optional window_size_right=::std::nullopt) { + return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left.has_value() ? ::std::make_optional(c10::SymInt(*window_size_left)) : ::std::nullopt, window_size_right.has_value() ? ::std::make_optional(c10::SymInt(*window_size_right)) : ::std::nullopt); +} +namespace symint { + template >> + ::std::tuple _flash_attention_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional scale=::std::nullopt, ::std::optional window_size_left=::std::nullopt, ::std::optional window_size_right=::std::nullopt) { + return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left.has_value() ? ::std::make_optional(c10::SymInt(*window_size_left)) : ::std::nullopt, window_size_right.has_value() ? ::std::make_optional(c10::SymInt(*window_size_right)) : ::std::nullopt); + } +} + +// aten::_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor rng_state, Tensor unused, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor) +inline ::std::tuple _flash_attention_backward_symint(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional scale=::std::nullopt, ::std::optional window_size_left=::std::nullopt, ::std::optional window_size_right=::std::nullopt) { + return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left, window_size_right); +} +namespace symint { + template >> + ::std::tuple _flash_attention_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional scale=::std::nullopt, ::std::optional window_size_left=::std::nullopt, ::std::optional window_size_right=::std::nullopt) { + return at::_ops::_flash_attention_backward::call(grad_out, query, key, value, out, logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, rng_state, unused, scale, window_size_left, window_size_right); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs.h new file mode 100644 index 0000000000000000000000000000000000000000..47400324a259c4ab57e70eef89bb29abfbb2202c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_abs(Tensor[] self) -> Tensor[] +inline ::std::vector _foreach_abs(at::TensorList self) { + return at::_ops::_foreach_abs::call(self); +} + +// aten::_foreach_abs_(Tensor(a!)[] self) -> () +inline void _foreach_abs_(at::TensorList self) { + return at::_ops::_foreach_abs_::call(self); +} + +// aten::_foreach_abs.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_abs_out(at::TensorList out, at::TensorList self) { + return at::_ops::_foreach_abs_out::call(self, out); +} +// aten::_foreach_abs.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_abs_outf(at::TensorList self, at::TensorList out) { + return at::_ops::_foreach_abs_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ffd8acfb086f8b3ddec204204ad8a543d8cde58f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_abs(at::TensorList self); +TORCH_API void _foreach_abs_(at::TensorList self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_asin.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_asin.h new file mode 100644 index 0000000000000000000000000000000000000000..0d2f4593a28ce22251a9bec54095b180ec346467 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_asin.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_asin(Tensor[] self) -> Tensor[] +inline ::std::vector _foreach_asin(at::TensorList self) { + return at::_ops::_foreach_asin::call(self); +} + +// aten::_foreach_asin_(Tensor(a!)[] self) -> () +inline void _foreach_asin_(at::TensorList self) { + return at::_ops::_foreach_asin_::call(self); +} + +// aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_asin_out(at::TensorList out, at::TensorList self) { + return at::_ops::_foreach_asin_out::call(self, out); +} +// aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_asin_outf(at::TensorList self, at::TensorList out) { + return at::_ops::_foreach_asin_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_atan_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_atan_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f7376cf29c88bf5f075b938c284e581e4855f778 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_atan_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_atan(at::TensorList self); +TORCH_API void _foreach_atan_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_atan_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_atan_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_max_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_max_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f2f397c780b26b4ac9ec09ae32bc993c67d29967 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_max_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_clamp_max(at::TensorList self, const at::Scalar & scalar); +TORCH_API void _foreach_clamp_max_(at::TensorList self, const at::Scalar & scalar); +TORCH_API ::std::vector _foreach_clamp_max(at::TensorList self, at::TensorList other); +TORCH_API void _foreach_clamp_max_(at::TensorList self, at::TensorList other); +TORCH_API ::std::vector _foreach_clamp_max(at::TensorList self, at::ArrayRef scalars); +TORCH_API void _foreach_clamp_max_(at::TensorList self, at::ArrayRef scalars); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_min_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_min_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9ceec46773fc6793695cc65168ec06732324a531 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_min_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_clamp_min(at::TensorList self, const at::Scalar & scalar); +TORCH_API void _foreach_clamp_min_(at::TensorList self, const at::Scalar & scalar); +TORCH_API ::std::vector _foreach_clamp_min(at::TensorList self, at::TensorList other); +TORCH_API void _foreach_clamp_min_(at::TensorList self, at::TensorList other); +TORCH_API ::std::vector _foreach_clamp_min(at::TensorList self, at::ArrayRef scalars); +TORCH_API void _foreach_clamp_min_(at::TensorList self, at::ArrayRef scalars); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lerp_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lerp_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c4c652d7de84f87e966eeaff9c053da31b1b95c2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lerp_native.h @@ -0,0 +1,40 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector foreach_tensor_ternary_lerp_slow(at::TensorList self, at::TensorList tensors1, at::TensorList weights); +TORCH_API void _foreach_lerp_List_out(at::TensorList self, at::TensorList tensors1, at::TensorList weights, at::TensorList out); +TORCH_API void foreach_tensor_ternary_lerp_slow_(at::TensorList self, at::TensorList tensors1, at::TensorList weights); +TORCH_API ::std::vector foreach_tensor_lerp_ternary_cuda(at::TensorList self, at::TensorList tensors1, at::TensorList weights); +TORCH_API void foreach_tensor_lerp_ternary_cuda_(at::TensorList self, at::TensorList tensors1, at::TensorList weights); +TORCH_API ::std::vector foreach_tensor_lerp_list_kernel_slow(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); +TORCH_API void _foreach_lerp_Scalar_out(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight, at::TensorList out); +TORCH_API void foreach_tensor_lerp_list_kernel_slow_(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); +TORCH_API ::std::vector foreach_tensor_lerp_list_cuda(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); +TORCH_API void foreach_tensor_lerp_list_cuda_(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); +TORCH_API ::std::vector foreach_tensor_lerp_scalarlist_kernel_slow(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); +TORCH_API void _foreach_lerp_ScalarList_out(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight, at::TensorList out); +TORCH_API void foreach_tensor_lerp_scalarlist_kernel_slow_(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); +TORCH_API ::std::vector foreach_tensor_lerp_scalarlist_cuda(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); +TORCH_API void foreach_tensor_lerp_scalarlist_cuda_(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e407a37dfb1ce1d5900503b286b1626016399456 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_log(at::TensorList self); +TORCH_API void _foreach_log_(at::TensorList self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_norm.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..fb8d4e00acd7fadc8f66e60ce2cb64bbe1d1b6c9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_norm.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_norm.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[] +inline ::std::vector _foreach_norm(at::TensorList self, const at::Scalar & ord=2, ::std::optional dtype=::std::nullopt) { + return at::_ops::_foreach_norm_Scalar::call(self, ord, dtype); +} + +// aten::_foreach_norm.Scalar_out(Tensor[] self, Scalar ord=2, ScalarType? dtype=None, *, Tensor(a!)[] out) -> () +inline void _foreach_norm_out(at::TensorList out, at::TensorList self, const at::Scalar & ord=2, ::std::optional dtype=::std::nullopt) { + return at::_ops::_foreach_norm_Scalar_out::call(self, ord, dtype, out); +} +// aten::_foreach_norm.Scalar_out(Tensor[] self, Scalar ord=2, ScalarType? dtype=None, *, Tensor(a!)[] out) -> () +inline void _foreach_norm_outf(at::TensorList self, const at::Scalar & ord, ::std::optional dtype, at::TensorList out) { + return at::_ops::_foreach_norm_Scalar_out::call(self, ord, dtype, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal.h new file mode 100644 index 0000000000000000000000000000000000000000..f3aaa6d2ee628419a5a51099d8e8ff6dc1351bc6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_reciprocal(Tensor[] self) -> Tensor[] +inline ::std::vector _foreach_reciprocal(at::TensorList self) { + return at::_ops::_foreach_reciprocal::call(self); +} + +// aten::_foreach_reciprocal_(Tensor(a!)[] self) -> () +inline void _foreach_reciprocal_(at::TensorList self) { + return at::_ops::_foreach_reciprocal_::call(self); +} + +// aten::_foreach_reciprocal.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_reciprocal_out(at::TensorList out, at::TensorList self) { + return at::_ops::_foreach_reciprocal_out::call(self, out); +} +// aten::_foreach_reciprocal.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_reciprocal_outf(at::TensorList self, at::TensorList out) { + return at::_ops::_foreach_reciprocal_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ab7cc3bf5f08125c5d4fa99b674f9edd57295a28 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_reciprocal(at::TensorList self); +TORCH_API void _foreach_reciprocal_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_reciprocal_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_reciprocal_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..09c9336e306177f39b7a2f7e0fdf21c362704502 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_reciprocal(at::TensorList self); +TORCH_API void _foreach_reciprocal_(at::TensorList self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sigmoid_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sigmoid_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..21b5a2e0a8a88061e354cc3be3c35652411ad506 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sigmoid_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_sigmoid(at::TensorList self); +TORCH_API void _foreach_sigmoid_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_sigmoid_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_sigmoid_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2bf4590e7cce3190f772f739e2c0c4c21d352ee9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_sign(at::TensorList self); +TORCH_API void _foreach_sign_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_sign_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_sign_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sqrt_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sqrt_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cec9f2125109bc25555fb550dde41f1ae0c4c337 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sqrt_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_sqrt(at::TensorList self); +TORCH_API void _foreach_sqrt_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_sqrt_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_sqrt_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sub_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sub_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..499adac91d8e33cb8e31ad59799625b01a2e091e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sub_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_sub(at::TensorList self, const at::Scalar & scalar); +TORCH_API void _foreach_sub_(at::TensorList self, const at::Scalar & scalar); +TORCH_API ::std::vector _foreach_sub(at::TensorList self, at::TensorList other, const at::Scalar & alpha=1); +TORCH_API void _foreach_sub_(at::TensorList self, at::TensorList other, const at::Scalar & alpha=1); +TORCH_API ::std::vector _foreach_sub(at::TensorList self, at::ArrayRef scalars); +TORCH_API void _foreach_sub_(at::TensorList self, at::ArrayRef scalars); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f9be73e5e4af26673687433475ff246c2bea037c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_zero(at::TensorList self); +TORCH_API void _foreach_zero_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_zero_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_zero_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ce408fecc0a3972315119f2a338ce6a4a447f304 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API void _foreach_zero_(at::TensorList self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size.h new file mode 100644 index 0000000000000000000000000000000000000000..7b3749c7bec7fef256fbff4d2cc972e6c4458dbe --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor +inline at::Tensor _functional_sym_constrain_range_for_size(const at::Scalar & size, ::std::optional min, ::std::optional max, const at::Tensor & dep_token) { + return at::_ops::_functional_sym_constrain_range_for_size::call(size, min, max, dep_token); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1af5d09c4e152720cb81533a7219875cba743ef2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _functional_sym_constrain_range_for_size { + using schema = at::Tensor (const at::Scalar &, ::std::optional, ::std::optional, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_functional_sym_constrain_range_for_size"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor"; + static at::Tensor call(const at::Scalar & size, ::std::optional min, ::std::optional max, const at::Tensor & dep_token); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & size, ::std::optional min, ::std::optional max, const at::Tensor & dep_token); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1cd10660ced37c9da28bb9a564ef3ce5f08a9105 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _functional_sym_constrain_range(const at::Scalar & size, ::std::optional min, ::std::optional max, const at::Tensor & dep_token); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad.h new file mode 100644 index 0000000000000000000000000000000000000000..d21f9bf639992cfca76c45bac8a1fec3e70654cf --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad.h @@ -0,0 +1,69 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () +inline void _fused_adagrad_(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad_::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf); +} + +// aten::_fused_adagrad_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () +inline void _fused_adagrad_(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad__tensor_lr::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf); +} + +// aten::_fused_adagrad.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> () +inline void _fused_adagrad_out(at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out); +} +// aten::_fused_adagrad.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> () +inline void _fused_adagrad_outf(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf, at::TensorList out) { + return at::_ops::_fused_adagrad_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out); +} + +// aten::_fused_adagrad(Tensor[] self, Tensor[] grads, Tensor[] state_sums, Tensor[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] state_sums_out, Tensor[] state_steps_out) +inline ::std::tuple<::std::vector,::std::vector,::std::vector,::std::vector> _fused_adagrad(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf); +} + +// aten::_fused_adagrad.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> () +inline void _fused_adagrad_out(at::TensorList out, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad_tensor_lr_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out); +} +// aten::_fused_adagrad.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None, Tensor(a!)[] out) -> () +inline void _fused_adagrad_outf(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf, at::TensorList out) { + return at::_ops::_fused_adagrad_tensor_lr_out::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf, out); +} + +// aten::_fused_adagrad.tensor_lr(Tensor[] self, Tensor[] grads, Tensor[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] state_sums_out) +inline ::std::tuple<::std::vector,::std::vector,::std::vector> _fused_adagrad(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}) { + return at::_ops::_fused_adagrad_tensor_lr::call(self, grads, state_sums, state_steps, lr, lr_decay, weight_decay, eps, maximize, grad_scale, found_inf); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3add70e853927d8d9e80cecba4d68a5942489d00 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple _fused_rms_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & rstd, const ::std::optional & weight, ::std::array output_mask); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7e82a8d1315e29aa2450793e5de14442d79f44a5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_rms_norm_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _fused_rms_norm_backward_cuda(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & rstd, const ::std::optional & weight, ::std::array output_mask); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_sdp_choice.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_sdp_choice.h new file mode 100644 index 0000000000000000000000000000000000000000..dfc4ae50e486450db87a0ae828114780d97ff53e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_sdp_choice.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_fused_sdp_choice(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, *, float? scale=None, bool enable_gqa=False) -> int +inline int64_t _fused_sdp_choice(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional & attn_mask={}, double dropout_p=0.0, bool is_causal=false, ::std::optional scale=::std::nullopt, bool enable_gqa=false) { + return at::_ops::_fused_sdp_choice::call(query, key, value, attn_mask, dropout_p, is_causal, scale, enable_gqa); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_has_same_storage_numel.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_has_same_storage_numel.h new file mode 100644 index 0000000000000000000000000000000000000000..075f59e8e2dea94c0f73edeba36770e2c3ee0c83 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_has_same_storage_numel.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_has_same_storage_numel(Tensor self, Tensor other) -> bool +inline bool _has_same_storage_numel(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::_has_same_storage_numel::call(self, other); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_histogramdd_bin_edges_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_histogramdd_bin_edges_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3ebbd414f8ae5436b41d69a36fa021bc9d7b5993 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_histogramdd_bin_edges_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::vector _histogramdd_bin_edges(const at::Tensor & self, at::IntArrayRef bins, ::std::optional> range=::std::nullopt, const ::std::optional & weight={}, bool density=false); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..57f52265eb554b090f67af0e7a86be3a4974bf55 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_meta_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor & _index_put_impl_(at::Tensor & self, const c10::List<::std::optional> & indices, const at::Tensor & values, bool accumulate=false, bool unsafe=false); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_int_mm_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_int_mm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..66dd22fe218e5d4ca448978bd637520b13251e75 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_int_mm_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _int_mm_cpu(const at::Tensor & self, const at::Tensor & mat2); +TORCH_API at::Tensor & _int_mm_out_cpu(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); +TORCH_API at::Tensor _int_mm_cuda(const at::Tensor & self, const at::Tensor & mat2); +TORCH_API at::Tensor & _int_mm_out_cuda(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_is_any_true_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_is_any_true_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3f970116420699adb1c87228ac513c1e23f15719 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_is_any_true_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _is_any_true(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_is_zerotensor_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_is_zerotensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b6eb678580de542cb8c3044bc1faea4ba27bd9ad --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_is_zerotensor_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API bool _is_zerotensor(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_jagged_to_padded_dense_forward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_jagged_to_padded_dense_forward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e436589d498f5bc79ba0fa255db2fc75afcc60db --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_jagged_to_padded_dense_forward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _jagged_to_padded_dense_forward_cpu(const at::Tensor & values, at::TensorList offsets, at::IntArrayRef max_lengths, double padding_value=0.0); +TORCH_API at::Tensor _fbgemm_jagged_to_padded_dense_forward(const at::Tensor & values, at::TensorList offsets, at::IntArrayRef max_lengths, double padding_value=0.0); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lazy_clone_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lazy_clone_native.h new file mode 100644 index 0000000000000000000000000000000000000000..abb51ddbbb78e4995d4fd4dfc06ba37007f281d3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lazy_clone_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _lazy_clone(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lazy_clone_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lazy_clone_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..6f53940c472c16e43eeccaa8f5b93957a1d77d3d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lazy_clone_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _lazy_clone { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_lazy_clone"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_lazy_clone(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_eigh_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_eigh_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..27fccf84ec8ae28217d61b44fd8636b6842e6b13 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_eigh_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API ::std::tuple _linalg_eigh(const at::Tensor & A, c10::string_view UPLO="L", bool compute_v=true); +TORCH_API ::std::tuple _linalg_eigh_out(at::Tensor & eigenvalues, at::Tensor & eigenvectors, const at::Tensor & A, c10::string_view UPLO="L", bool compute_v=true); +TORCH_API ::std::tuple _linalg_eigh_outf(const at::Tensor & A, c10::string_view UPLO, bool compute_v, at::Tensor & eigenvalues, at::Tensor & eigenvectors); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_slogdet_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_slogdet_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d299f5dd9425a527abdeba62c0f3f9e7c293e26e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_slogdet_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API ::std::tuple _linalg_slogdet(const at::Tensor & A); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..11790b6389fa75823969ddd95aa03cd319dd91c5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured__linalg_svd : public at::impl::MetaBase { + + + void meta(const at::Tensor & A, bool full_matrices, bool compute_uv, ::std::optional driver); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_local_scalar_dense.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_local_scalar_dense.h new file mode 100644 index 0000000000000000000000000000000000000000..372cf08e89f646a5c7b925bf5c9d549737305aef --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_local_scalar_dense.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_local_scalar_dense(Tensor self) -> Scalar +inline at::Scalar _local_scalar_dense(const at::Tensor & self) { + return at::_ops::_local_scalar_dense::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8ef313606714305d8aa67783ee0bb55104dfa024 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor _log_softmax_backward_data(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype); +TORCH_API at::Tensor & _log_softmax_backward_data_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype); +TORCH_API at::Tensor & _log_softmax_backward_data_outf(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..13442417e38ef45875e9da399ac74601c018ea66 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _logcumsumexp(const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor & _logcumsumexp_out(at::Tensor & out, const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor & _logcumsumexp_outf(const at::Tensor & self, int64_t dim, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b0eaa1052e385ffef1c5b41c66d534911996bbb3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _logcumsumexp_cpu(const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor & _logcumsumexp_out_cpu(const at::Tensor & self, int64_t dim, at::Tensor & out); +TORCH_API at::Tensor _logcumsumexp_cuda(const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor & _logcumsumexp_out_cuda(const at::Tensor & self, int64_t dim, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lstm_mps.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lstm_mps.h new file mode 100644 index 0000000000000000000000000000000000000000..fca80d725ad1264af48467fa58af499bf6ebf16c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lstm_mps.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) +inline ::std::tuple _lstm_mps(const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) { + return at::_ops::_lstm_mps::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first); +} + +// aten::_lstm_mps.out(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!)) +inline ::std::tuple _lstm_mps_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5, const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) { + return at::_ops::_lstm_mps_out::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first, out0, out1, out2, out3, out4, out5); +} +// aten::_lstm_mps.out(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!)) +inline ::std::tuple _lstm_mps_outf(const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5) { + return at::_ops::_lstm_mps_out::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first, out0, out1, out2, out3, out4, out5); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lu_with_info.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lu_with_info.h new file mode 100644 index 0000000000000000000000000000000000000000..907ff3f14c12147356a55dad34809c865d4f0e47 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_lu_with_info.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor LU, Tensor pivots, Tensor info) +inline ::std::tuple _lu_with_info(const at::Tensor & self, bool pivot=true, bool check_errors=true) { + return at::_ops::_lu_with_info::call(self, pivot, check_errors); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_channel_quantized_tensor_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_channel_quantized_tensor_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..24104a392cabc0b0aa865a0c133b36ae4c495d11 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_channel_quantized_tensor_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _make_per_channel_quantized_tensor(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2e73239afcb868afa2d1032289c10d9588ce0188 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _masked_softmax_backward_out(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim, at::Tensor & out); +TORCH_API at::Tensor masked_softmax_backward_cpu(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim=::std::nullopt); +TORCH_API at::Tensor masked_softmax_backward_cuda(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8ae4354935814764678de5d11afa2e89e9bb701c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _masked_softmax_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_masked_softmax_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? dim=None) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim); +}; + +struct TORCH_API _masked_softmax_backward_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_masked_softmax_backward"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_masked_softmax_backward.out(Tensor grad_output, Tensor output, Tensor mask, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..59c3453ecc8d7b6facd3268e2805cbd6d84f5f71 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _masked_softmax_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, ::std::optional dim=::std::nullopt, ::std::optional mask_type=::std::nullopt); +TORCH_API at::Tensor & _masked_softmax_outf(const at::Tensor & self, const at::Tensor & mask, ::std::optional dim, ::std::optional mask_type, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cd491a92c0f7469ce2531c8eb189c5a0cfebb9bb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _masked_softmax(const at::Tensor & self, const at::Tensor & mask, ::std::optional dim=::std::nullopt, ::std::optional mask_type=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2bdea632e87cff75f9b9e9c60bb2642af150de97 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _masked_softmax_out(const at::Tensor & self, const at::Tensor & mask, ::std::optional dim, ::std::optional mask_type, at::Tensor & out); +TORCH_API at::Tensor masked_softmax_cpu(const at::Tensor & self, const at::Tensor & mask, ::std::optional dim=::std::nullopt, ::std::optional mask_type=::std::nullopt); +TORCH_API at::Tensor masked_softmax_cuda(const at::Tensor & self, const at::Tensor & mask, ::std::optional dim=::std::nullopt, ::std::optional mask_type=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_mixed_dtypes_linear_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_mixed_dtypes_linear_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c3af178fc58eddb3a9499cf9d993a4138d2b5872 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_mixed_dtypes_linear_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _mixed_dtypes_linear { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_mixed_dtypes_linear"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & scale, const ::std::optional & bias, ::std::optional activation); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const at::Tensor & scale, const ::std::optional & bias, ::std::optional activation); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_reshape_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_reshape_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3f9cfb8c53f58e7f261b2a22c1c4c7eff5ad0ad0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_reshape_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _mkldnn_reshape_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef shape); +TORCH_API at::Tensor & _mkldnn_reshape_outf(const at::Tensor & self, at::IntArrayRef shape, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_transpose_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_transpose_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f219e6d87ba323151de51669a4fc7bdb6f8c3ed2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_mkldnn_transpose_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _mkldnn_transpose_out(const at::Tensor & self, int64_t dim0, int64_t dim1, at::Tensor & out); +TORCH_API at::Tensor mkldnn_transpose(const at::Tensor & self, int64_t dim0, int64_t dim1); +TORCH_API at::Tensor & mkldnn_transpose_(at::Tensor & self, int64_t dim0, int64_t dim1); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_compute_contiguous_strides_offsets_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_compute_contiguous_strides_offsets_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..e72f964230f3504d92c068faf21b3b6a145939cb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_compute_contiguous_strides_offsets_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _nested_compute_contiguous_strides_offsets { + using schema = ::std::tuple (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_nested_compute_contiguous_strides_offsets"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & nested_size); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & nested_size); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f514d6f3ce2de208196370b85a4a000078b4a05c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _nested_from_padded(const at::Tensor & padded, const at::Tensor & cpu_nested_shape_example, bool fuse_transform_0213=false); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_tensor_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_tensor_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4de1a206d28e19524165a397566759d623b0e17f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_tensor_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _nested_from_padded_tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, const ::std::optional &, const ::std::optional &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_nested_from_padded_tensor"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_nested_from_padded_tensor(Tensor padded, Tensor offsets, Tensor dummy, int ragged_idx=1, Tensor? min_seqlen=None, Tensor? max_seqlen=None, SymInt? sum_S=None) -> Tensor"; + static at::Tensor call(const at::Tensor & padded, const at::Tensor & offsets, const at::Tensor & dummy, int64_t ragged_idx, const ::std::optional & min_seqlen, const ::std::optional & max_seqlen, ::std::optional sum_S); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & padded, const at::Tensor & offsets, const at::Tensor & dummy, int64_t ragged_idx, const ::std::optional & min_seqlen, const ::std::optional & max_seqlen, ::std::optional sum_S); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_ragged_idx_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_ragged_idx_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0463f07d0019e4a3a7c2e520af1d14cb76e15912 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_ragged_idx_native.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask.h new file mode 100644 index 0000000000000000000000000000000000000000..f02af7d29f82643df1ce850045f223ae47c59df1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_nested_tensor_from_mask(Tensor t, Tensor mask, bool mask_check=True) -> Tensor +inline at::Tensor _nested_tensor_from_mask(const at::Tensor & t, const at::Tensor & mask, bool mask_check=true) { + return at::_ops::_nested_tensor_from_mask::call(t, mask, mask_check); +} + +// aten::_nested_tensor_from_mask.out(Tensor t, Tensor mask, bool mask_check=True, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _nested_tensor_from_mask_out(at::Tensor & out, const at::Tensor & t, const at::Tensor & mask, bool mask_check=true) { + return at::_ops::_nested_tensor_from_mask_out::call(t, mask, mask_check, out); +} +// aten::_nested_tensor_from_mask.out(Tensor t, Tensor mask, bool mask_check=True, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _nested_tensor_from_mask_outf(const at::Tensor & t, const at::Tensor & mask, bool mask_check, at::Tensor & out) { + return at::_ops::_nested_tensor_from_mask_out::call(t, mask, mask_check, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask_left_aligned_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask_left_aligned_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ab57a85a3f8a8c61fd77638bebb3b0fcde1632d8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask_left_aligned_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API bool _nested_tensor_from_mask_left_aligned(const at::Tensor & t, const at::Tensor & mask); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_size_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_size_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f3151d0806862c1dba7495d28d32e46395ce7b38 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_size_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _nested_tensor_size { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_nested_tensor_size"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_nested_tensor_size(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API _nested_tensor_size_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_nested_tensor_size"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_nested_tensor_size.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_jagged_copy_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_jagged_copy_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4962f1e75348ba9c93c1826b997e93bd42262953 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_jagged_copy_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _nested_view_from_jagged_copy_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const ::std::optional & lengths={}, int64_t ragged_idx=1, const ::std::optional & min_seqlen={}, const ::std::optional & max_seqlen={}); +TORCH_API at::Tensor & _nested_view_from_jagged_copy_outf(const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const ::std::optional & lengths, int64_t ragged_idx, const ::std::optional & min_seqlen, const ::std::optional & max_seqlen, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pack_padded_sequence_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pack_padded_sequence_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..223088a20dbb4589b8b5f893df87bfd8aee1fa67 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pack_padded_sequence_backward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _pack_padded_sequence_backward { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_pack_padded_sequence_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_pack_padded_sequence_backward(Tensor grad, SymInt[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor"; + static at::Tensor call(const at::Tensor & grad, c10::SymIntArrayRef input_size, const at::Tensor & batch_sizes, bool batch_first); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, c10::SymIntArrayRef input_size, const at::Tensor & batch_sizes, bool batch_first); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular.h new file mode 100644 index 0000000000000000000000000000000000000000..cf2eb31239b6da7eb6f50a47cce6dbcb8c2791db --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_pad_circular(Tensor self, SymInt[] pad) -> Tensor +inline at::Tensor _pad_circular(const at::Tensor & self, at::IntArrayRef pad) { + return at::_ops::_pad_circular::call(self, c10::fromIntArrayRefSlow(pad)); +} +namespace symint { + template >> + at::Tensor _pad_circular(const at::Tensor & self, at::IntArrayRef pad) { + return at::_ops::_pad_circular::call(self, c10::fromIntArrayRefSlow(pad)); + } +} + +// aten::_pad_circular(Tensor self, SymInt[] pad) -> Tensor +inline at::Tensor _pad_circular_symint(const at::Tensor & self, c10::SymIntArrayRef pad) { + return at::_ops::_pad_circular::call(self, pad); +} +namespace symint { + template >> + at::Tensor _pad_circular(const at::Tensor & self, c10::SymIntArrayRef pad) { + return at::_ops::_pad_circular::call(self, pad); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular_native.h new file mode 100644 index 0000000000000000000000000000000000000000..585178effb25488c4c24136d5abcbe617f0bfb23 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _pad_circular_symint(const at::Tensor & self, c10::SymIntArrayRef pad); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_packed_sequence_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_packed_sequence_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d6c483ccd1b11e617b36a61e430ca7be667e5c0f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_packed_sequence_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _pad_packed_sequence { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, bool, const at::Scalar &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_pad_packed_sequence"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & data, const at::Tensor & batch_sizes, bool batch_first, const at::Scalar & padding_value, int64_t total_length); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & data, const at::Tensor & batch_sizes, bool batch_first, const at::Scalar & padding_value, int64_t total_length); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..51f2d0a94938706b1550a61afca369b1c6f7d234 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _pdist_backward_out(const at::Tensor & grad, const at::Tensor & self, double p, const at::Tensor & pdist, at::Tensor & out); +TORCH_API at::Tensor _pdist_backward(const at::Tensor & grad, const at::Tensor & self, double p, const at::Tensor & pdist); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0ecea6d7d1f4740ef09ee4487f026919679ad164 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _pdist_forward_out(at::Tensor & out, const at::Tensor & self, double p=2); +TORCH_API at::Tensor & _pdist_forward_outf(const at::Tensor & self, double p, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a77e26cec8cba39479bd42aa5f366531cb61840f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _pdist_forward(const at::Tensor & self, double p=2); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_prelu_kernel_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_prelu_kernel_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..78559a7fb8cfc286a203b4720c7123bd3cdc88c0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_prelu_kernel_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _prelu_kernel_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight); +TORCH_API ::std::tuple mkldnn_prelu_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..00ac62d6ed2dbe916b996f7ab025bf2dd6ce96d6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_meta_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor _reshape_alias(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride); +TORCH_API at::Tensor _reshape_alias_symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_copy.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..b8f5bd5efdc8a5d9ea0c4e3a038d57c5ab645793 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_copy.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_reshape_copy(Tensor self, SymInt[] size) -> Tensor +inline at::Tensor _reshape_copy(const at::Tensor & self, at::IntArrayRef size) { + return at::_ops::_reshape_copy::call(self, c10::fromIntArrayRefSlow(size)); +} +namespace symint { + template >> + at::Tensor _reshape_copy(const at::Tensor & self, at::IntArrayRef size) { + return at::_ops::_reshape_copy::call(self, c10::fromIntArrayRefSlow(size)); + } +} + +// aten::_reshape_copy(Tensor self, SymInt[] size) -> Tensor +inline at::Tensor _reshape_copy_symint(const at::Tensor & self, c10::SymIntArrayRef size) { + return at::_ops::_reshape_copy::call(self, size); +} +namespace symint { + template >> + at::Tensor _reshape_copy(const at::Tensor & self, c10::SymIntArrayRef size) { + return at::_ops::_reshape_copy::call(self, size); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention.h new file mode 100644 index 0000000000000000000000000000000000000000..b1a7d306565d2ff7b53c4c3c7c2ae10b21ed2871 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset) +inline ::std::tuple _scaled_dot_product_efficient_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional & attn_bias, bool compute_log_sumexp, double dropout_p=0.0, bool is_causal=false, ::std::optional scale=::std::nullopt) { + return at::_ops::_scaled_dot_product_efficient_attention::call(query, key, value, attn_bias, compute_log_sumexp, dropout_p, is_causal, scale); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3a1eefa0e230a2930bc0298ce041f486ff0407f4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_backward_cuda(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, ::std::optional scale=::std::nullopt); +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_backward_nested(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, ::std::optional scale=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..6d8cd079fbf6338ef13cd5fe86de077b0af45c85 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_cpu_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, double dropout_p, bool is_causal, const ::std::optional & attn_mask={}, ::std::optional scale=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..811ad7ef3b80b8a9100c6cc0175f679fb52939af --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_for_cpu(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, const ::std::optional & attn_mask={}, ::std::optional scale=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_native.h new file mode 100644 index 0000000000000000000000000000000000000000..8d166a3dd1b0fac5450b2185fec166f0a7a012e8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_cpu(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, const ::std::optional & attn_mask={}, ::std::optional scale=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e4ff4b1170fe6159fb55c432347102b482b264ab --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_cuda(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, bool return_debug_mask=false, ::std::optional scale=::std::nullopt); +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_nestedtensor_cuda(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, bool return_debug_mask=false, ::std::optional scale=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_mm_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_mm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c89233cc12bb80da96d50f6a9aa8d1d37374d139 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_mm_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _scaled_mm_cpu(const at::Tensor & self, const at::Tensor & mat2, const at::Tensor & scale_a, const at::Tensor & scale_b, const ::std::optional & bias={}, const ::std::optional & scale_result={}, ::std::optional out_dtype=::std::nullopt, bool use_fast_accum=false); +TORCH_API at::Tensor & _scaled_mm_out_cpu(const at::Tensor & self, const at::Tensor & mat2, const at::Tensor & scale_a, const at::Tensor & scale_b, const ::std::optional & bias, const ::std::optional & scale_result, ::std::optional out_dtype, bool use_fast_accum, at::Tensor & out); +TORCH_API at::Tensor _scaled_mm_cuda(const at::Tensor & self, const at::Tensor & mat2, const at::Tensor & scale_a, const at::Tensor & scale_b, const ::std::optional & bias={}, const ::std::optional & scale_result={}, ::std::optional out_dtype=::std::nullopt, bool use_fast_accum=false); +TORCH_API at::Tensor & _scaled_mm_out_cuda(const at::Tensor & self, const at::Tensor & mat2, const at::Tensor & scale_a, const at::Tensor & scale_b, const ::std::optional & bias, const ::std::optional & scale_result, ::std::optional out_dtype, bool use_fast_accum, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_ff_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_ff_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..471c3bfaa06567b2149278f567b1bbea6b44ea8d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_ff_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _sobol_engine_ff_ { + using schema = at::Tensor & (at::Tensor &, int64_t, const at::Tensor &, int64_t, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sobol_engine_ff_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, int64_t n, const at::Tensor & sobolstate, int64_t dimension, int64_t num_generated); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t n, const at::Tensor & sobolstate, int64_t dimension, int64_t num_generated); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_initialize_state.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_initialize_state.h new file mode 100644 index 0000000000000000000000000000000000000000..b2d744d5f8a24c9dfc7b80a599dc417e9f80ef79 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_initialize_state.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!) +inline at::Tensor & _sobol_engine_initialize_state_(at::Tensor & self, int64_t dimension) { + return at::_ops::_sobol_engine_initialize_state_::call(self, dimension); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_backward_data_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_backward_data_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..f2745c3ffc6efb4a77af718e2a95cb20e7915895 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_backward_data_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured__softmax_backward_data : public at::impl::MetaBase { + + + void meta(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..43d337f6870b7175a1ab17a366be95728ea57118 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured__softmax : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, int64_t dim, bool half_to_float); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to.h new file mode 100644 index 0000000000000000000000000000000000000000..911336e9f484aa76ff266fb1f3c0f0d197222736 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) +inline at::Tensor _sparse_broadcast_to(const at::Tensor & self, at::IntArrayRef size) { + return at::_ops::_sparse_broadcast_to::call(self, size); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to_copy_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to_copy_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e9d198d9191643e221ec9a58a8e88d36c2879ccf --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to_copy_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _sparse_broadcast_to_copy_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size); +TORCH_API at::Tensor & _sparse_broadcast_to_copy_outf(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_prod_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_prod_native.h new file mode 100644 index 0000000000000000000000000000000000000000..89a1960067ab9ec686fbb7222514f6138c6a852a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_prod_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _sparse_csr_prod_dim_dtype_out(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out); +TORCH_API at::Tensor _sparse_csr_prod_cpu(const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor _sparse_csr_prod_cuda(const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false, ::std::optional dtype=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_tensor_unsafe_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_tensor_unsafe_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e7f2ab4cdb8217e321b98a308306a2e6f9b797ad --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_tensor_unsafe_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _sparse_csr_tensor_unsafe(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..a3639ea96f2587566cbce6f2af9c52e77e510f59 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax.h @@ -0,0 +1,55 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor +inline at::Tensor _sparse_log_softmax(const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::_sparse_log_softmax_int::call(self, dim, dtype); +} + +// aten::_sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor +inline at::Tensor _sparse_log_softmax(const at::Tensor & self, at::Dimname dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::_sparse_log_softmax_Dimname::call(self, dim, dtype); +} + +// aten::_sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor +inline at::Tensor _sparse_log_softmax(const at::Tensor & self, int64_t dim, bool half_to_float) { + return at::_ops::_sparse_log_softmax::call(self, dim, half_to_float); +} + +// aten::_sparse_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _sparse_log_softmax_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float) { + return at::_ops::_sparse_log_softmax_out::call(self, dim, half_to_float, out); +} +// aten::_sparse_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _sparse_log_softmax_outf(const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out) { + return at::_ops::_sparse_log_softmax_out::call(self, dim, half_to_float, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mm_reduce_impl_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mm_reduce_impl_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..be054e403728f881d3ea5a073c6c151e40a4f08c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mm_reduce_impl_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _sparse_mm_reduce_impl_backward_sparse_csr_cpu(const at::Tensor & self, const at::Tensor & grad_out, const at::Tensor & weight, c10::string_view reduce, const at::Tensor & arg_out, ::std::array output_mask); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_semi_structured_addmm_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_semi_structured_addmm_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8ba9054ff7481c1502e229ec74d57e621450ffe7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_semi_structured_addmm_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _sparse_semi_structured_addmm(const at::Tensor & input, const at::Tensor & mat1, const at::Tensor & mat1_meta, const at::Tensor & mat2, const at::Scalar & alpha=1, const at::Scalar & beta=1, ::std::optional out_dtype=::std::nullopt); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_softmax.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..dadd446e1558cdd30f8916690c307be7f38e78ef --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_softmax.h @@ -0,0 +1,55 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor +inline at::Tensor _sparse_softmax(const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::_sparse_softmax_int::call(self, dim, dtype); +} + +// aten::_sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor +inline at::Tensor _sparse_softmax(const at::Tensor & self, at::Dimname dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::_sparse_softmax_Dimname::call(self, dim, dtype); +} + +// aten::_sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor +inline at::Tensor _sparse_softmax(const at::Tensor & self, int64_t dim, bool half_to_float) { + return at::_ops::_sparse_softmax::call(self, dim, half_to_float); +} + +// aten::_sparse_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _sparse_softmax_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float) { + return at::_ops::_sparse_softmax_out::call(self, dim, half_to_float, out); +} +// aten::_sparse_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _sparse_softmax_outf(const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out) { + return at::_ops::_sparse_softmax_out::call(self, dim, half_to_float, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_native.h new file mode 100644 index 0000000000000000000000000000000000000000..930d59669a87fbcd43da5716fd3cb2f31ea67dd7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _sparse_sparse_matmul_out(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor sparse_sparse_matmul_cpu(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor sparse_sparse_matmul_cuda(const at::Tensor & self, const at::Tensor & other); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..e4d589fdeb2f5109a92d5043269eda697fc8fffa --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _sparse_sparse_matmul { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sparse_sparse_matmul"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API _sparse_sparse_matmul_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sparse_sparse_matmul"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_sparse_sparse_matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sum_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sum_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0ca2cd18cae7365fd8f189075f15072b75aa8ab9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sum_native.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _sparse_sum(const at::Tensor & self); +TORCH_API at::Tensor _sparse_sum(const at::Tensor & self, at::ScalarType dtype); +TORCH_API at::Tensor _sparse_sum(const at::Tensor & self, at::IntArrayRef dim); +TORCH_API at::Tensor & _sparse_sum_dim_out(const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out); +TORCH_API at::Tensor _sparse_sum(const at::Tensor & self, at::IntArrayRef dim, at::ScalarType dtype); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_stack_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_stack_native.h new file mode 100644 index 0000000000000000000000000000000000000000..fba4c186ab9052d3309a38cbddad2dfc35b36a3d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_stack_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _stack(at::TensorList tensors, int64_t dim=0); +TORCH_API at::Tensor & _stack_out(at::TensorList tensors, int64_t dim, at::Tensor & out); +TORCH_API at::Tensor _stack_cpu(at::TensorList tensors, int64_t dim=0); +TORCH_API at::Tensor & _stack_out_cpu(at::TensorList tensors, int64_t dim, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c249a0fc07d1461b42d58519266805bc51a6a94e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _standard_gamma_out(at::Tensor & out, const at::Tensor & self, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & _standard_gamma_outf(const at::Tensor & self, ::std::optional generator, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8b6f881f166a30ba184f2ec5f9af048b887dd640 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _standard_gamma_grad_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & output); +TORCH_API at::Tensor & _standard_gamma_grad_outf(const at::Tensor & self, const at::Tensor & output, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..54341165b2b3f4bf25433b7898df896efeef2751 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _standard_gamma_grad(const at::Tensor & self, const at::Tensor & output); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f40182a055fe089878a17d09c7f6f2f8366a7bb0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor _test_autograd_multiple_dispatch(const at::Tensor & self, bool b); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..219fa6399b3b02a7c7ae8cd3a92a00e76dc672b1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_test_check_tensor(Tensor self) -> Tensor +inline at::Tensor _test_check_tensor(const at::Tensor & self) { + return at::_ops::_test_check_tensor::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..752fc286ba3dc65363c80c6034abed9424fee34d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _test_check_tensor(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_functorch_fallback.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_functorch_fallback.h new file mode 100644 index 0000000000000000000000000000000000000000..fa3986e2d107e545a79bea4ab5d1cebbb16e0bce --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_functorch_fallback.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_test_functorch_fallback(Tensor self, Tensor other) -> Tensor +inline at::Tensor _test_functorch_fallback(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::_test_functorch_fallback::call(self, other); +} + +// aten::_test_functorch_fallback.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _test_functorch_fallback_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::_test_functorch_fallback_out::call(self, other, out); +} +// aten::_test_functorch_fallback.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _test_functorch_fallback_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::_test_functorch_fallback_out::call(self, other, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_optional_floatlist_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_optional_floatlist_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f0ecddf3e0510315944de0337a9442c789158245 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_optional_floatlist_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _test_optional_floatlist_out(at::Tensor & out, const at::Tensor & values, ::std::optional> addends); +TORCH_API at::Tensor & _test_optional_floatlist_outf(const at::Tensor & values, ::std::optional> addends, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d9270da98fca4dc399ae91cdaa6644841de9b95b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _test_serialization_subcmul { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_test_serialization_subcmul"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_warn_in_autograd_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_warn_in_autograd_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ebc37dbc78804386bbea23d0abd0fe00d28939ab --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_test_warn_in_autograd_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _test_warn_in_autograd { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_test_warn_in_autograd"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_test_warn_in_autograd(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API _test_warn_in_autograd_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_test_warn_in_autograd"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_test_warn_in_autograd.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..5d2586820a918d37e3b4d7bf43d3af4e05b4edca --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _thnn_fused_gru_cell { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_thnn_fused_gru_cell"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias, const ::std::optional & hidden_bias); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias, const ::std::optional & hidden_bias); +}; + +struct TORCH_API _thnn_fused_gru_cell_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, const ::std::optional &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_thnn_fused_gru_cell"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_thnn_fused_gru_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias, const ::std::optional & hidden_bias, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias, const ::std::optional & hidden_bias, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..d1ce0340c7d1100eff4e1c5bf1125d3d565cdead --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor _to_copy(const at::Tensor & self, at::TensorOptions options={}, bool non_blocking=false, ::std::optional memory_format=::std::nullopt) { + return at::_ops::_to_copy::call(self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), non_blocking, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); +} +// aten::_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor _to_copy(const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, bool non_blocking, ::std::optional memory_format) { + return at::_ops::_to_copy::call(self, dtype, layout, device, pin_memory, non_blocking, memory_format); +} + +// aten::_to_copy.out(Tensor self, *, bool non_blocking=False, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _to_copy_out(at::Tensor & out, const at::Tensor & self, bool non_blocking=false, ::std::optional memory_format=::std::nullopt) { + return at::_ops::_to_copy_out::call(self, non_blocking, memory_format, out); +} +// aten::_to_copy.out(Tensor self, *, bool non_blocking=False, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _to_copy_outf(const at::Tensor & self, bool non_blocking, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::_to_copy_out::call(self, non_blocking, memory_format, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d2474019f7853b95bd4e81a1f3f66bb4f7bfb88e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _to_copy(const at::Tensor & self, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, bool non_blocking=false, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor & _to_copy_out(const at::Tensor & self, bool non_blocking, ::std::optional memory_format, at::Tensor & out); +TORCH_API at::Tensor _to_copy_nested(const at::Tensor & self, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, bool non_blocking=false, ::std::optional memory_format=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0d01020c46abca44f72082c8f37cd0c9cc88537d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _to_copy { + using schema = at::Tensor (const at::Tensor &, ::std::optional, ::std::optional, ::std::optional, ::std::optional, bool, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_to_copy"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, bool non_blocking, ::std::optional memory_format); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, bool non_blocking, ::std::optional memory_format); +}; + +struct TORCH_API _to_copy_out { + using schema = at::Tensor & (const at::Tensor &, bool, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_to_copy"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_to_copy.out(Tensor self, *, bool non_blocking=False, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, bool non_blocking, ::std::optional memory_format, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking, ::std::optional memory_format, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7e38fce86eb4a5cf711e50d808e9b5cf2ce30942 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _to_sparse_bsr { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_to_sparse_bsr"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim); +}; + +struct TORCH_API _to_sparse_bsr_out { + using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_to_sparse_bsr"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_to_sparse_bsr.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csc_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csc_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..939346d9f1dcb5a7232297ed33578561da565021 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csc_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _to_sparse_csc(const at::Tensor & self, ::std::optional dense_dim=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csr_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csr_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cc469abb37d75ebad8b18be6da4e2679bdb4a1af --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csr_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _to_sparse_csr(const at::Tensor & self, ::std::optional dense_dim=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_transform_bias_rescale_qkv_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_transform_bias_rescale_qkv_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0ee3248d33e912610d22bec1ba303798f769823a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_transform_bias_rescale_qkv_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple _transform_bias_rescale_qkv_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads); +TORCH_API ::std::tuple _transform_bias_rescale_qkv_outf(const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_multi_head_attention_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_multi_head_attention_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ebe2db9719d00aa0b049ac5c65d262d111104468 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_multi_head_attention_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _triton_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const ::std::optional & mask={}); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d2bd218485f9d276058ce2446cccd4bc123a8e60 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _triton_scaled_dot_attention_out(at::Tensor & out, const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p=0.0); +TORCH_API at::Tensor & _triton_scaled_dot_attention_outf(const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique.h new file mode 100644 index 0000000000000000000000000000000000000000..b6891eedd3cf1797bac158db4b4d64bc2ca575cc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor) +inline ::std::tuple _unique(const at::Tensor & self, bool sorted=true, bool return_inverse=false) { + return at::_ops::_unique::call(self, sorted, return_inverse); +} + +// aten::_unique.out(Tensor self, bool sorted=True, bool return_inverse=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _unique_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, bool sorted=true, bool return_inverse=false) { + return at::_ops::_unique_out::call(self, sorted, return_inverse, out0, out1); +} +// aten::_unique.out(Tensor self, bool sorted=True, bool return_inverse=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _unique_outf(const at::Tensor & self, bool sorted, bool return_inverse, at::Tensor & out0, at::Tensor & out1) { + return at::_ops::_unique_out::call(self, sorted, return_inverse, out0, out1); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..36172229b4fc54d1a0e0ad09f64f589570324127 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple _unique(const at::Tensor & self, bool sorted=true, bool return_inverse=false); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..73f2304be212fda3479f6266fb381295459cd817 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple _unique(const at::Tensor & self, bool sorted=true, bool return_inverse=false); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0e1bc0e9b273277f6707ee1d5d9a28a5fd15d138 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _unique { + using schema = ::std::tuple (const at::Tensor &, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_unique"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & self, bool sorted, bool return_inverse); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool sorted, bool return_inverse); +}; + +struct TORCH_API _unique_out { + using schema = ::std::tuple (const at::Tensor &, bool, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_unique"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_unique.out(Tensor self, bool sorted=True, bool return_inverse=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & self, bool sorted, bool return_inverse, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool sorted, bool return_inverse, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..87b55d7b8275073493f218460560cf468dc3bbae --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor _unsafe_masked_index(const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Scalar & fill); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7c7d8b5970ca591e5ea630599c939ca62274e151 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _unsafe_masked_index(const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Scalar & fill); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3ada4a7be7abc8f3c84b3fbec2efcb6939bfe577 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _unsafe_masked_index_put_accumulate(const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Tensor & values); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..716b3587080fd6c7623546d8a8ab39b50dbf429b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _unsafe_masked_index_put_accumulate { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const c10::List<::std::optional> &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_unsafe_masked_index_put_accumulate"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_unsafe_masked_index_put_accumulate(Tensor self, Tensor mask, Tensor?[] indices, Tensor values) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Tensor & values); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Tensor & values); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d26e88f4d0dd26288c607b5d12b3824cb1cc7975 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor _upsample_bicubic2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor _upsample_bicubic2d_aa_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0300e1f673106d97d40900676b505487f5628920 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _upsample_bicubic2d_aa_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, bool, ::std::optional, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_upsample_bicubic2d_aa_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); +}; + +struct TORCH_API _upsample_bicubic2d_aa_backward { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, bool, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_upsample_bicubic2d_aa_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a327a94d0712e6328cbea4551994f9343fba1ecb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _upsample_bilinear2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor _upsample_bilinear2d_aa_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e66ac643f7fdc2cef5bf38856463fcf0ad6072f3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor _upsample_bilinear2d_aa(const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, ::std::optional> scale_factors); +TORCH_API at::Tensor _upsample_bilinear2d_aa_symint(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, ::std::optional> scale_factors); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..1dcc8a9909c6df47bb47fc36f03ed899ae974fbc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_backward.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & _upsample_nearest_exact1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::_upsample_nearest_exact1d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::_upsample_nearest_exact1d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input); + } +} + +// aten::_upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & _upsample_nearest_exact1d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales, at::Tensor & grad_input) { + return at::_ops::_upsample_nearest_exact1d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact1d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales, at::Tensor & grad_input) { + return at::_ops::_upsample_nearest_exact1d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input); + } +} + +// aten::_upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & _upsample_nearest_exact1d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::_upsample_nearest_exact1d_backward_grad_input::call(grad_output, output_size, input_size, scales, grad_input); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::_upsample_nearest_exact1d_backward_grad_input::call(grad_output, output_size, input_size, scales, grad_input); + } +} + +// aten::_upsample_nearest_exact1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & _upsample_nearest_exact1d_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales, at::Tensor & grad_input) { + return at::_ops::_upsample_nearest_exact1d_backward_grad_input::call(grad_output, output_size, input_size, scales, grad_input); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact1d_backward_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales, at::Tensor & grad_input) { + return at::_ops::_upsample_nearest_exact1d_backward_grad_input::call(grad_output, output_size, input_size, scales, grad_input); + } +} + +// aten::_upsample_nearest_exact1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor +inline at::Tensor _upsample_nearest_exact1d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::_upsample_nearest_exact1d_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales); +} +namespace symint { + template >> + at::Tensor _upsample_nearest_exact1d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::_upsample_nearest_exact1d_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales); + } +} + +// aten::_upsample_nearest_exact1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor +inline at::Tensor _upsample_nearest_exact1d_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::_upsample_nearest_exact1d_backward::call(grad_output, output_size, input_size, scales); +} +namespace symint { + template >> + at::Tensor _upsample_nearest_exact1d_backward(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::_upsample_nearest_exact1d_backward::call(grad_output, output_size, input_size, scales); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..32a94a5a500d88c3d390761e8aeab05074f4aba8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _upsample_nearest_exact1d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor _upsample_nearest_exact1d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact1d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales, at::Tensor & out); +TORCH_API at::Tensor & _upsample_nearest_exact1d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact1d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact2d_backward_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact2d_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..44e8f550e8e78b94bba586917a7e39ac0acf1869 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact2d_backward_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _upsample_nearest_exact2d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor _upsample_nearest_exact2d_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact2d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); +TORCH_API at::Tensor & _upsample_nearest_exact2d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact2d_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h new file mode 100644 index 0000000000000000000000000000000000000000..d52eb044dd747fbf60232e6c36a5a7c044eb772a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h @@ -0,0 +1,119 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor +inline at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalIntArrayRef output_size, ::std::optional> scale_factors) { + return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*output_size)) : ::std::nullopt, scale_factors); +} +namespace symint { + template >> + at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalIntArrayRef output_size, ::std::optional> scale_factors) { + return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*output_size)) : ::std::nullopt, scale_factors); + } +} + +// aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor +inline at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, ::std::optional> scale_factors) { + return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size, scale_factors); +} +namespace symint { + template >> + at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, ::std::optional> scale_factors) { + return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size, scale_factors); + } +} + +// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out); + } +} + +// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out); + } +} + +// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _upsample_nearest_exact3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out); + } +} + +// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _upsample_nearest_exact3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out); +} +namespace symint { + template >> + at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out) { + return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out); + } +} + +// aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor +inline at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w); +} +namespace symint { + template >> + at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w); + } +} + +// aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor +inline at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d::call(self, output_size, scales_d, scales_h, scales_w); +} +namespace symint { + template >> + at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_nearest_exact3d::call(self, output_size, scales_d, scales_h, scales_w); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ce1394832fda673c044b579554af20ee047b435c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +TORCH_API at::Tensor & _upsample_nearest_exact3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..920b6bb15d74b7ff09cdb07a8fc53faa9c2c81eb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +TORCH_API at::Tensor & _upsample_nearest_exact3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h new file mode 100644 index 0000000000000000000000000000000000000000..143ebaf74c8ba2b8b54159a1ae5586287301f4d4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool +inline bool _use_cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank) { + return at::_ops::_use_cudnn_ctc_loss::call(log_probs, targets, input_lengths, target_lengths, blank); +} + +// aten::_use_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> bool +inline bool _use_cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank) { + return at::_ops::_use_cudnn_ctc_loss_Tensor::call(log_probs, targets, input_lengths, target_lengths, blank); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_compressed_sparse_indices_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_compressed_sparse_indices_native.h new file mode 100644 index 0000000000000000000000000000000000000000..8ec5be86aff469f10ceb0eb9c335f5f6513e3ca2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_compressed_sparse_indices_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API void _validate_compressed_sparse_indices_cpu(bool is_crow, const at::Tensor & compressed_idx, const at::Tensor & plain_idx, int64_t cdim, int64_t dim, int64_t nnz); +TORCH_API void _validate_compressed_sparse_indices_cuda(bool is_crow, const at::Tensor & compressed_idx, const at::Tensor & plain_idx, int64_t cdim, int64_t dim, int64_t nnz); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..600078dc02cf1542fc314968fc5752c7029742e1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _validate_sparse_bsc_tensor_args { + using schema = void (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_validate_sparse_bsc_tensor_args"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()"; + static void call(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args.h new file mode 100644 index 0000000000000000000000000000000000000000..26adb70a8e4ed9ab4f15b0a54981b07129a26b24 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, bool? check_pinning=None) -> () +inline void _validate_sparse_bsr_tensor_args(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning=::std::nullopt) { + return at::_ops::_validate_sparse_bsr_tensor_args::call(crow_indices, col_indices, values, size, check_pinning); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c7d6708303629d8bab44c9e45f0f0e6ca3337256 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _validate_sparse_bsr_tensor_args { + using schema = void (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_validate_sparse_bsr_tensor_args"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_validate_sparse_bsr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, bool? check_pinning=None) -> ()"; + static void call(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args.h new file mode 100644 index 0000000000000000000000000000000000000000..0a9928ceda0e5b1db79d53c11a350dbb1e6821b6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None, bool? check_pinning=None) -> () +inline void _validate_sparse_coo_tensor_args(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional is_coalesced=::std::nullopt, ::std::optional check_pinning=::std::nullopt) { + return at::_ops::_validate_sparse_coo_tensor_args::call(indices, values, size, is_coalesced, check_pinning); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1875fc30a94296112e6a8e80bbd458cae0314779 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _validate_sparse_coo_tensor_args { + using schema = void (const at::Tensor &, const at::Tensor &, at::IntArrayRef, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_validate_sparse_coo_tensor_args"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None, bool? check_pinning=None) -> ()"; + static void call(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional is_coalesced, ::std::optional check_pinning); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional is_coalesced, ::std::optional check_pinning); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csr_tensor_args.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csr_tensor_args.h new file mode 100644 index 0000000000000000000000000000000000000000..0549a303cc0d7859e43f7e8829d8f86eaeca5c6b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csr_tensor_args.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, bool? check_pinning=None) -> () +inline void _validate_sparse_csr_tensor_args(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning=::std::nullopt) { + return at::_ops::_validate_sparse_csr_tensor_args::call(crow_indices, col_indices, values, size, check_pinning); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_values_copy_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_values_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1f87c77ef661d05dcde2cc4a74bbf73b0d3f00e6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_values_copy_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _values_copy_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor _values_copy(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_values_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_values_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3f90a341a103dbf05c5e33e7de737f74f0690341 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_values_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _values { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_values"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_values(Tensor(a) self) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_int4pack_mm_with_scales_and_zeros_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_int4pack_mm_with_scales_and_zeros_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0463f07d0019e4a3a7c2e520af1d14cb76e15912 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_int4pack_mm_with_scales_and_zeros_native.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_differentiable_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_differentiable_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..f3625fb931b4817778e6d62967faf2a2002493b3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_differentiable_backward.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) +inline ::std::tuple _weight_norm_differentiable_backward(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim) { + return at::_ops::_weight_norm_differentiable_backward::call(grad_w, saved_v, saved_g, saved_norms, dim); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9f2401ac35082d0798cc3ec6b34a9babdeecee25 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _weight_norm_interface_backward_out(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim, at::Tensor & out0, at::Tensor & out1); +TORCH_API ::std::tuple weight_norm_backward_cpu(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim); +TORCH_API ::std::tuple weight_norm_backward_cuda(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_linear_prepack_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_linear_prepack_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ddc82b6fb6b64aa734c70bd4aed92de92411f972 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_linear_prepack_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _wrapped_linear_prepack(const at::Tensor & weight, const at::Tensor & weight_scale, const at::Tensor & weight_zero_point, const at::Tensor & bias); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_quantized_linear_prepacked_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_quantized_linear_prepacked_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..98bc32b7020176294200047a1b4d50348297ae3d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/_wrapped_quantized_linear_prepacked_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _wrapped_quantized_linear_prepacked { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_wrapped_quantized_linear_prepacked"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_wrapped_quantized_linear_prepacked(Tensor input, Tensor input_scale, Tensor input_zero_point, Tensor packed_weight, Tensor output_scale, Tensor output_zero_point, int out_channel) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & input_scale, const at::Tensor & input_zero_point, const at::Tensor & packed_weight, const at::Tensor & output_scale, const at::Tensor & output_zero_point, int64_t out_channel); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & input_scale, const at::Tensor & input_zero_point, const at::Tensor & packed_weight, const at::Tensor & output_scale, const at::Tensor & output_zero_point, int64_t out_channel); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/abs_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/abs_native.h new file mode 100644 index 0000000000000000000000000000000000000000..cc9d18c80ee8d89927ce363b2196420910207b46 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/abs_native.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor abs(const at::Tensor & self); +TORCH_API at::Tensor & abs_(at::Tensor & self); +TORCH_API at::Tensor & abs_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor NestedTensor_abs(const at::Tensor & self); +TORCH_API at::Tensor & NestedTensor_abs_(at::Tensor & self); +TORCH_API at::Tensor abs_sparse(const at::Tensor & self); +TORCH_API at::Tensor & abs_sparse_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & abs_sparse_(at::Tensor & self); +TORCH_API at::Tensor abs_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & abs_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & abs_sparse_csr_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..23a04ae49197dbab6b7082a3cccf6c09b81b9170 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template >> + at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template >> + at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); +} +namespace symint { + template >> + at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); + } +} + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); +} +namespace symint { + template >> + at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); + } +} + +// aten::adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor +inline at::Tensor adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, c10::fromIntArrayRefSlow(output_size)); +} +namespace symint { + template >> + at::Tensor adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, c10::fromIntArrayRefSlow(output_size)); + } +} + +// aten::adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor +inline at::Tensor adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, output_size); +} +namespace symint { + template >> + at::Tensor adaptive_avg_pool2d(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, output_size); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b7a694fdd8830471714f56928c94455f41d6549e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..98822d0c96f16a4419610620aa64bb9726207982 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h @@ -0,0 +1,40 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::adaptive_avg_pool3d_backward_grad_input::call(grad_output, self, grad_input); +} +// aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input) { + return at::_ops::adaptive_avg_pool3d_backward_grad_input::call(grad_output, self, grad_input); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..46d2c374bf0b738cade598885aa0ec5389972b70 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_max_pool1d { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::adaptive_max_pool1d"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef output_size); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..8330cc6042604efbe8beff0676c62a4ffc91210e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_adaptive_max_pool2d_backward_out_cpu : public at::meta::structured_adaptive_max_pool2d_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, const at::Tensor & grad_input); +}; +struct TORCH_API structured_adaptive_max_pool2d_backward_out_cuda : public at::meta::structured_adaptive_max_pool2d_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, const at::Tensor & grad_input); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2082e948e489b05d505222d09c7ebd8a267d696b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_max_pool2d_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::adaptive_max_pool2d_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); +}; + +struct TORCH_API adaptive_max_pool2d_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::adaptive_max_pool2d_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..e7da7709328608e719aa367c3044093196489553 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_max_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) { + return at::_ops::adaptive_max_pool3d_backward_grad_input::call(grad_output, self, indices, grad_input); +} +// aten::adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_max_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input) { + return at::_ops::adaptive_max_pool3d_backward_grad_input::call(grad_output, self, indices, grad_input); +} + +// aten::adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor +inline at::Tensor adaptive_max_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) { + return at::_ops::adaptive_max_pool3d_backward::call(grad_output, self, indices); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5262359de7788dbca32a7d8c6f1141e017c9c490 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor adaptive_max_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/add.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/add.h new file mode 100644 index 0000000000000000000000000000000000000000..b316ed943e1cc3519a118da0315eec0d6c225240 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/add.h @@ -0,0 +1,59 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor +inline at::Tensor add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) { + return at::_ops::add_Tensor::call(self, other, alpha); +} + +// aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & add_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) { + return at::_ops::add_out::call(self, other, alpha, out); +} +// aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & add_outf(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out) { + return at::_ops::add_out::call(self, other, alpha, out); +} + +// aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor +inline at::Tensor add(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) { + return at::_ops::add_Scalar::call(self, other, alpha); +} + +// aten::add.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & add_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) { + return at::_ops::add_Scalar_out::call(self, other, alpha, out); +} +// aten::add.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & add_outf(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha, at::Tensor & out) { + return at::_ops::add_Scalar_out::call(self, other, alpha, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/add_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/add_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6e2b2e15c4be4b38d0bbd2314521e3c452186390 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/add_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & add_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addcmul_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addcmul_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..fe4e21672fa297a91c6fa60855fc4f9c7fc6d593 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addcmul_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_addcmul : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b4da70b420338b3a31dd3c7f5c0c0d9aa691ce11 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_native.h @@ -0,0 +1,42 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_addmm_out_cpu : public at::meta::structured_addmm { +void impl(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, const at::Tensor & out); +}; +struct TORCH_API structured_addmm_out_cuda : public at::meta::structured_addmm { +void impl(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, const at::Tensor & out); +}; +TORCH_API at::Tensor addmm_sparse_dense_cpu(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addmm_out_sparse_dense_cpu(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & s_addmm_sparse_dense_cpu_(at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor addmm_sparse_dense_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addmm_out_sparse_dense_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & s_addmm_sparse_dense_cuda_(at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor addmm_sparse_compressed_dense(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addmm_out_sparse_compressed_cpu(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & addmm_out_sparse_compressed_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor _addmm_dtype_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & _addmm_dtype_out_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ed7d4ff3f299962c10ebf42e4933f539e929e97b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_ops.h @@ -0,0 +1,78 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API addmm_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmm"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +}; + +struct TORCH_API addmm { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmm"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha); +}; + +struct TORCH_API addmm_dtype { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::ScalarType, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmm"; + static constexpr const char* overload_name = "dtype"; + static constexpr const char* schema_str = "addmm.dtype(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha); +}; + +struct TORCH_API addmm_dtype_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::ScalarType, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmm"; + static constexpr const char* overload_name = "dtype_out"; + static constexpr const char* schema_str = "addmm.dtype_out(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +}; + +struct TORCH_API addmm_ { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmm_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addmv_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addmv_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..8df48fed3202f123cf01cd991bd8ef64c9cf716b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/addmv_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_addmv : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/alias.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/alias.h new file mode 100644 index 0000000000000000000000000000000000000000..a4d24b44737f76521e94c3e2e27a9af561cc2741 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/alias.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::alias(Tensor(a) self) -> Tensor(a) +inline at::Tensor alias(const at::Tensor & self) { + return at::_ops::alias::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/alias_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/alias_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e9c495b26d15911dd26637d21b1f635857087bdc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/alias_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor alias(const at::Tensor & self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/amax_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/amax_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..9e4a8338013dc782cc220bc3cde8a4a8bc52e9b9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/amax_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_amax : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, at::IntArrayRef dim, bool keepdim); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/angle_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/angle_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4c96c857dd55c790e42882f0a76d9a7543872311 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/angle_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor angle(const at::Tensor & self); +TORCH_API at::Tensor & angle_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor angle_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & angle_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/angle_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/angle_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4209716da85f17a20a46b036558c0b6155d1d87f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/angle_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API angle { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::angle"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "angle(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API angle_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::angle"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "angle.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/any_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/any_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..95a4a60040f70ffb84d3062a607f838d5921095c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/any_cuda_dispatch.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor any(const at::Tensor & self, int64_t dim, bool keepdim=false); +TORCH_API at::Tensor & any_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool keepdim=false); +TORCH_API at::Tensor & any_outf(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out); +TORCH_API at::Tensor any(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false); +TORCH_API at::Tensor & any_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false); +TORCH_API at::Tensor & any_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out); +TORCH_API at::Tensor any(const at::Tensor & self); +TORCH_API at::Tensor & any_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & any_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/arange_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/arange_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..24a34bbbf5a85cf9ecd432fd0dccc7eb27926988 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/arange_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & arange_out(at::Tensor & out, const at::Scalar & start, const at::Scalar & end, const at::Scalar & step); +TORCH_API at::Tensor & arange_outf(const at::Scalar & start, const at::Scalar & end, const at::Scalar & step, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/arccosh_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/arccosh_native.h new file mode 100644 index 0000000000000000000000000000000000000000..067c6ab35c331b610df9e8f6671280d54b9704b5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/arccosh_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor arccosh(const at::Tensor & self); +TORCH_API at::Tensor & arccosh_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & arccosh_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/arctanh_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/arctanh_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d98ecc829e952fb085603756920c6379c29c267a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/arctanh_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API arctanh { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::arctanh"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "arctanh(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API arctanh_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::arctanh_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "arctanh_(Tensor(a!) self) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API arctanh_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::arctanh"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/argmin_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/argmin_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ae5e637068bf39a33a78767ca007532e941aa2e5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/argmin_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor argmin(const at::Tensor & self, ::std::optional dim=::std::nullopt, bool keepdim=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/argwhere.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/argwhere.h new file mode 100644 index 0000000000000000000000000000000000000000..f87588dadc89d5f7f4f09f9b68a076baa415d138 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/argwhere.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::argwhere(Tensor self) -> Tensor +inline at::Tensor argwhere(const at::Tensor & self) { + return at::_ops::argwhere::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b19138420ea8f4b1b7078d946a218fe700acc674 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API const at::Tensor & as_strided_(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt); +TORCH_API const at::Tensor & as_strided__symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_copy.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..0f43a394c8b55a4fd0cf577090e69510fe9a33f1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_copy.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor +inline at::Tensor as_strided_copy(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt); +} +namespace symint { + template >> + at::Tensor as_strided_copy(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt); + } +} + +// aten::as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor +inline at::Tensor as_strided_copy_symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy::call(self, size, stride, storage_offset); +} +namespace symint { + template >> + at::Tensor as_strided_copy(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy::call(self, size, stride, storage_offset); + } +} + +// aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & as_strided_copy_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy_out::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt, out); +} +namespace symint { + template >> + at::Tensor & as_strided_copy_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy_out::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt, out); + } +} + +// aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & as_strided_copy_outf(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset, at::Tensor & out) { + return at::_ops::as_strided_copy_out::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt, out); +} +namespace symint { + template >> + at::Tensor & as_strided_copy_outf(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset, at::Tensor & out) { + return at::_ops::as_strided_copy_out::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt, out); + } +} + +// aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & as_strided_copy_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy_out::call(self, size, stride, storage_offset, out); +} +namespace symint { + template >> + at::Tensor & as_strided_copy_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy_out::call(self, size, stride, storage_offset, out); + } +} + +// aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & as_strided_copy_symint_outf(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset, at::Tensor & out) { + return at::_ops::as_strided_copy_out::call(self, size, stride, storage_offset, out); +} +namespace symint { + template >> + at::Tensor & as_strided_copy_outf(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset, at::Tensor & out) { + return at::_ops::as_strided_copy_out::call(self, size, stride, storage_offset, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_scatter_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_scatter_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6cb5a721773e522b779c423db8b803e16c954472 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_scatter_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor as_strided_scatter(const at::Tensor & self, const at::Tensor & src, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt); +TORCH_API at::Tensor as_strided_scatter_symint(const at::Tensor & self, const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/asinh_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/asinh_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4e435b6809166ba60a54cf2c8635c06e069c8f52 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/asinh_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor asinh(const at::Tensor & self); +TORCH_API at::Tensor & asinh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & asinh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & asinh_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/asinh_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/asinh_native.h new file mode 100644 index 0000000000000000000000000000000000000000..89d10c4042914f63caa58725feb42d7888fa5c44 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/asinh_native.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_asinh_out : public at::meta::structured_asinh { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +TORCH_API at::Tensor asinh_sparse(const at::Tensor & self); +TORCH_API at::Tensor & asinh_sparse_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & asinh_sparse_(at::Tensor & self); +TORCH_API at::Tensor asinh_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & asinh_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & asinh_sparse_csr_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atan.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atan.h new file mode 100644 index 0000000000000000000000000000000000000000..bfae5e8938a1835ef501168a380862b905f47eb7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atan.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::atan(Tensor self) -> Tensor +inline at::Tensor atan(const at::Tensor & self) { + return at::_ops::atan::call(self); +} + +// aten::atan_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & atan_(at::Tensor & self) { + return at::_ops::atan_::call(self); +} + +// aten::atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & atan_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::atan_out::call(self, out); +} +// aten::atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & atan_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::atan_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atan2_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atan2_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..feb3c44aeb4aa2b171bd9cff18f1290f72285949 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atan2_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_atan2 : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atan_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atan_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c993cad67f9d1b95fa141f1ad60b181370c12c1e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atan_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API atan { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::atan"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "atan(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API atan_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::atan_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "atan_(Tensor(a!) self) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API atan_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::atan"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atanh_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atanh_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..92f62ded996c33504b4a68f8084d7cfb82308ec8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atanh_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_atanh : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_2d_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_2d_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..deaf7b671db6d033a68111ea87f0daa367d468c6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_2d_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor atleast_2d(const at::Tensor & self); +TORCH_API ::std::vector atleast_2d(at::TensorList tensors); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..f7671ca6bd65381c8d6e4d42988a36bedd446189 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt) { + return at::_ops::avg_pool2d_out::call(self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, out); +} +// aten::avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override, at::Tensor & out) { + return at::_ops::avg_pool2d_out::call(self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, out); +} + +// aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor +inline at::Tensor avg_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt) { + return at::_ops::avg_pool2d::call(self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f29db18d0f31e0a14aa7503f090d29ea14446817 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor avg_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..335df8354423966ceea626bfdfdadc7c786ad641 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor avg_pool3d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool3d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3802fa1a0adda9a5e452dd11c46d5bbe49b4df53 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor avg_pool3d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool3d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/baddbmm_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/baddbmm_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..552b076438f45350ae16e38f0294c6aab07823bf --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/baddbmm_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_baddbmm : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_backward_elemt.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_backward_elemt.h new file mode 100644 index 0000000000000000000000000000000000000000..320882853e8d17c518baf8367da5fd4b8cc2a96c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_backward_elemt.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::batch_norm_backward_elemt(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count) -> Tensor +inline at::Tensor batch_norm_backward_elemt(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const ::std::optional & weight, const at::Tensor & sum_dy, const at::Tensor & sum_dy_xmu, const at::Tensor & count) { + return at::_ops::batch_norm_backward_elemt::call(grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count); +} + +// aten::batch_norm_backward_elemt.out(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & batch_norm_backward_elemt_out(at::Tensor & out, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const ::std::optional & weight, const at::Tensor & sum_dy, const at::Tensor & sum_dy_xmu, const at::Tensor & count) { + return at::_ops::batch_norm_backward_elemt_out::call(grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count, out); +} +// aten::batch_norm_backward_elemt.out(Tensor grad_out, Tensor input, Tensor mean, Tensor invstd, Tensor? weight, Tensor sum_dy, Tensor sum_dy_xmu, Tensor count, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & batch_norm_backward_elemt_outf(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const ::std::optional & weight, const at::Tensor & sum_dy, const at::Tensor & sum_dy_xmu, const at::Tensor & count, at::Tensor & out) { + return at::_ops::batch_norm_backward_elemt_out::call(grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu, count, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9d711448056ebf6a2f7a8aca378ad498a2cfe6f7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _new_batch_norm_backward_cpu(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & running_mean, const ::std::optional & running_var, const ::std::optional & save_mean, const ::std::optional & save_var, bool update, double eps, ::std::array output_mask, const at::Tensor & reserve); +TORCH_API ::std::tuple _new_batch_norm_backward_cuda(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & running_mean, const ::std::optional & running_var, const ::std::optional & save_mean, const ::std::optional & save_var, bool update, double eps, ::std::array output_mask, const at::Tensor & reserve); +TORCH_API ::std::tuple _new_batch_norm_backward_mkldnn(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & running_mean, const ::std::optional & running_var, const ::std::optional & save_mean, const ::std::optional & save_var, bool update, double eps, ::std::array output_mask, const at::Tensor & reserve); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..47d50a3a94b7846bd188c146f28221d1d8b269de --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor batch_norm_elemt(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps); +TORCH_API at::Tensor & batch_norm_elemt_out(at::Tensor & out, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps); +TORCH_API at::Tensor & batch_norm_elemt_outf(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_stats.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_stats.h new file mode 100644 index 0000000000000000000000000000000000000000..79e893c069b8941c574ca9b194cad6cbcbcf1d08 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_stats.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor) +inline ::std::tuple batch_norm_stats(const at::Tensor & input, double eps) { + return at::_ops::batch_norm_stats::call(input, eps); +} + +// aten::batch_norm_stats.out(Tensor input, float eps, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple batch_norm_stats_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & input, double eps) { + return at::_ops::batch_norm_stats_out::call(input, eps, out0, out1); +} +// aten::batch_norm_stats.out(Tensor input, float eps, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple batch_norm_stats_outf(const at::Tensor & input, double eps, at::Tensor & out0, at::Tensor & out1) { + return at::_ops::batch_norm_stats_out::call(input, eps, out0, out1); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_update_stats_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_update_stats_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..be0c358b5875dbf6defce627c15fe8acd2881d4b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_update_stats_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple batch_norm_update_stats_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & input, const ::std::optional & running_mean, const ::std::optional & running_var, double momentum); +TORCH_API ::std::tuple batch_norm_update_stats_outf(const at::Tensor & input, const ::std::optional & running_mean, const ::std::optional & running_var, double momentum, at::Tensor & out0, at::Tensor & out1); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..def15556239972f5e1613226ba291b8c30d17907 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor +inline at::Tensor binary_cross_entropy_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight={}, int64_t reduction=at::Reduction::Mean) { + return at::_ops::binary_cross_entropy_backward::call(grad_output, self, target, weight, reduction); +} + +// aten::binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & binary_cross_entropy_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight={}, int64_t reduction=at::Reduction::Mean) { + return at::_ops::binary_cross_entropy_backward_grad_input::call(grad_output, self, target, weight, reduction, grad_input); +} +// aten::binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & binary_cross_entropy_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, at::Tensor & grad_input) { + return at::_ops::binary_cross_entropy_backward_grad_input::call(grad_output, self, target, weight, reduction, grad_input); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fb05703b0b97efbd62a36f1bbac614d5d8b3ec3c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API binary_cross_entropy_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::binary_cross_entropy_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction); +}; + +struct TORCH_API binary_cross_entropy_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::binary_cross_entropy_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, at::Tensor & grad_input); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binomial.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binomial.h new file mode 100644 index 0000000000000000000000000000000000000000..34eeaa547aed60de7dcee2a0909e64c5d9391dfa --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binomial.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor +inline at::Tensor binomial(const at::Tensor & count, const at::Tensor & prob, ::std::optional generator=::std::nullopt) { + return at::_ops::binomial::call(count, prob, generator); +} + +// aten::binomial.out(Tensor count, Tensor prob, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & binomial_out(at::Tensor & out, const at::Tensor & count, const at::Tensor & prob, ::std::optional generator=::std::nullopt) { + return at::_ops::binomial_out::call(count, prob, generator, out); +} +// aten::binomial.out(Tensor count, Tensor prob, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & binomial_outf(const at::Tensor & count, const at::Tensor & prob, ::std::optional generator, at::Tensor & out) { + return at::_ops::binomial_out::call(count, prob, generator, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..70b0df8ec385b40c466d9a55e98cef1da5f3432e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor binomial(const at::Tensor & count, const at::Tensor & prob, ::std::optional generator=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..879ea35fe4532747ef81d3b4fac7171820fe47cb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor binomial(const at::Tensor & count, const at::Tensor & prob, ::std::optional generator=::std::nullopt); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7c2c6d5c7a73ba0af4b4940e07287374da1b1367 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor bitwise_not(const at::Tensor & self); +TORCH_API at::Tensor & bitwise_not_(at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..93d0456a3071dbee9ca22831a46d34ce89bb4c01 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_or_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor bitwise_or(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & bitwise_or_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & bitwise_or_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & bitwise_or_(at::Tensor & self, const at::Tensor & other); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..906ac525b4a514d85b1db3bc63b51dd8439c2edb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor bmm(const at::Tensor & self, const at::Tensor & mat2); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_tensors_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_tensors_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7aed549c8117ecaaa2b77968a9d812c3de540751 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_tensors_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector broadcast_tensors(at::TensorList tensors); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_to_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_to_native.h new file mode 100644 index 0000000000000000000000000000000000000000..60c36fda07c66b7660ae24145ad26f8aca634fdd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_to_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor broadcast_to_symint(const at::Tensor & self, c10::SymIntArrayRef size); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cartesian_prod_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cartesian_prod_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2601175e5daf21d0fbc45cb281955763c9a5b7dc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cartesian_prod_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor cartesian_prod(at::TensorList tensors); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cat_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cat_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..852b4aea82296da8ffc1c57286fb7f379e2b0e6d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cat_meta.h @@ -0,0 +1,119 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_cat : public at::impl::MetaBase { + + template + struct TORCH_API precompute_out { + + precompute_out set_dim(int64_t value) { + static_assert(DIM == false, "dim already set"); + precompute_out ret; +ret.dim = value; +ret.valid = this->valid; +ret.all_contiguous = this->all_contiguous; +ret.all_same_dtype = this->all_same_dtype; +ret.all_same_sizes_and_stride = this->all_same_sizes_and_stride; +ret.memory_format = this->memory_format; +return ret; + } + + + precompute_out set_valid(int64_t value) { + static_assert(VALID == false, "valid already set"); + precompute_out ret; +ret.dim = this->dim; +ret.valid = value; +ret.all_contiguous = this->all_contiguous; +ret.all_same_dtype = this->all_same_dtype; +ret.all_same_sizes_and_stride = this->all_same_sizes_and_stride; +ret.memory_format = this->memory_format; +return ret; + } + + + precompute_out set_all_contiguous(bool value) { + static_assert(ALL_CONTIGUOUS == false, "all_contiguous already set"); + precompute_out ret; +ret.dim = this->dim; +ret.valid = this->valid; +ret.all_contiguous = value; +ret.all_same_dtype = this->all_same_dtype; +ret.all_same_sizes_and_stride = this->all_same_sizes_and_stride; +ret.memory_format = this->memory_format; +return ret; + } + + + precompute_out set_all_same_dtype(bool value) { + static_assert(ALL_SAME_DTYPE == false, "all_same_dtype already set"); + precompute_out ret; +ret.dim = this->dim; +ret.valid = this->valid; +ret.all_contiguous = this->all_contiguous; +ret.all_same_dtype = value; +ret.all_same_sizes_and_stride = this->all_same_sizes_and_stride; +ret.memory_format = this->memory_format; +return ret; + } + + + precompute_out set_all_same_sizes_and_stride(bool value) { + static_assert(ALL_SAME_SIZES_AND_STRIDE == false, "all_same_sizes_and_stride already set"); + precompute_out ret; +ret.dim = this->dim; +ret.valid = this->valid; +ret.all_contiguous = this->all_contiguous; +ret.all_same_dtype = this->all_same_dtype; +ret.all_same_sizes_and_stride = value; +ret.memory_format = this->memory_format; +return ret; + } + + + precompute_out set_memory_format(at::MemoryFormat value) { + static_assert(MEMORY_FORMAT == false, "memory_format already set"); + precompute_out ret; +ret.dim = this->dim; +ret.valid = this->valid; +ret.all_contiguous = this->all_contiguous; +ret.all_same_dtype = this->all_same_dtype; +ret.all_same_sizes_and_stride = this->all_same_sizes_and_stride; +ret.memory_format = value; +return ret; + } + + int64_t dim; +int64_t valid; +bool all_contiguous; +bool all_same_dtype; +bool all_same_sizes_and_stride; +at::MemoryFormat memory_format; + }; + using meta_return_ty = precompute_out ; + meta_return_ty meta(const at::ITensorListRef & tensors, int64_t dim); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cdist.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cdist.h new file mode 100644 index 0000000000000000000000000000000000000000..1d325816116d92f7cd994086b6f69c614d83f4a1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cdist.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor +inline at::Tensor cdist(const at::Tensor & x1, const at::Tensor & x2, double p=2, ::std::optional compute_mode=::std::nullopt) { + return at::_ops::cdist::call(x1, x2, p, compute_mode); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ceil_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ceil_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..32037c8f71fb650389bcdabc1b7018ed1720bed8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ceil_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor ceil(const at::Tensor & self); +TORCH_API at::Tensor & ceil_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & ceil_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & ceil_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/chalf.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/chalf.h new file mode 100644 index 0000000000000000000000000000000000000000..f7d0d155a0e9732e27907b419efab2425d41f6ac --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/chalf.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4f231628d44751d96914a54cf1f6166708edc772 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector chunk(const at::Tensor & self, int64_t chunks, int64_t dim=0); +TORCH_API ::std::vector chunk_nested_tensor(const at::Tensor & self, int64_t chunks, int64_t dim=0); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..272a44791d5ac1b585d6767eb4bf08b5952e35df --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API chunk { + using schema = ::std::vector (const at::Tensor &, int64_t, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::chunk"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "chunk(Tensor(a -> *) self, int chunks, int dim=0) -> Tensor(a)[]"; + static ::std::vector call(const at::Tensor & self, int64_t chunks, int64_t dim); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t chunks, int64_t dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4be8931cb0789bd843e6a591724c60a44285a325 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor clamp(const at::Tensor & self, const ::std::optional & min, const ::std::optional & max=::std::nullopt); +TORCH_API at::Tensor & clamp_(at::Tensor & self, const ::std::optional & min, const ::std::optional & max=::std::nullopt); +TORCH_API at::Tensor clamp(const at::Tensor & self, const ::std::optional & min={}, const ::std::optional & max={}); +TORCH_API at::Tensor & clamp_(at::Tensor & self, const ::std::optional & min={}, const ::std::optional & max={}); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_max_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_max_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..73ab1130cf7ef1a2a7639403ea4acff48154510a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_max_cpu_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor clamp_max(const at::Tensor & self, const at::Scalar & max); +TORCH_API at::Tensor & clamp_max_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & max); +TORCH_API at::Tensor & clamp_max_outf(const at::Tensor & self, const at::Scalar & max, at::Tensor & out); +TORCH_API at::Tensor & clamp_max_(at::Tensor & self, const at::Scalar & max); +TORCH_API at::Tensor clamp_max(const at::Tensor & self, const at::Tensor & max); +TORCH_API at::Tensor & clamp_max_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & max); +TORCH_API at::Tensor & clamp_max_outf(const at::Tensor & self, const at::Tensor & max, at::Tensor & out); +TORCH_API at::Tensor & clamp_max_(at::Tensor & self, const at::Tensor & max); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/concat_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/concat_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7fa32c1c64f5659a5fe2396c97288ff9abb2a5cc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/concat_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor concat(at::TensorList tensors, int64_t dim=0); +TORCH_API at::Tensor & concat_out(at::Tensor & out, at::TensorList tensors, int64_t dim=0); +TORCH_API at::Tensor & concat_outf(at::TensorList tensors, int64_t dim, at::Tensor & out); +TORCH_API at::Tensor concat(at::TensorList tensors, at::Dimname dim); +TORCH_API at::Tensor & concat_out(at::Tensor & out, at::TensorList tensors, at::Dimname dim); +TORCH_API at::Tensor & concat_outf(at::TensorList tensors, at::Dimname dim, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conj_physical_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conj_physical_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e8c861010d8435f7000396fb6e8969c11aff8dc7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conj_physical_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & conj_physical_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & conj_physical_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/constant_pad_nd_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/constant_pad_nd_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3d897bb7e4c9aa4a1155e0dea446261817a0d433 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/constant_pad_nd_compositeexplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor constant_pad_nd(const at::Tensor & self, at::IntArrayRef pad, const at::Scalar & value=0); +TORCH_API at::Tensor constant_pad_nd_symint(const at::Tensor & self, c10::SymIntArrayRef pad, const at::Scalar & value=0); +TORCH_API at::Tensor & constant_pad_nd_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef pad, const at::Scalar & value=0); +TORCH_API at::Tensor & constant_pad_nd_outf(const at::Tensor & self, at::IntArrayRef pad, const at::Scalar & value, at::Tensor & out); +TORCH_API at::Tensor & constant_pad_nd_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef pad, const at::Scalar & value=0); +TORCH_API at::Tensor & constant_pad_nd_symint_outf(const at::Tensor & self, c10::SymIntArrayRef pad, const at::Scalar & value, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_depthwise3d_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_depthwise3d_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0e2a61f11b9a6461d67dd4ea3aa58b2fc02ddd0a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_depthwise3d_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & conv_depthwise3d_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation); +TORCH_API at::Tensor & conv_depthwise3d_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, at::Tensor & out); +TORCH_API at::Tensor & conv_depthwise3d_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation); +TORCH_API at::Tensor & conv_depthwise3d_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d.h new file mode 100644 index 0000000000000000000000000000000000000000..04ef9e280238a8ab81655cc34eedade4f672e1de --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor +inline at::Tensor conv_transpose2d(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, int64_t groups=1, at::IntArrayRef dilation=1) { + return at::_ops::conv_transpose2d_input::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), groups, c10::fromIntArrayRefSlow(dilation)); +} +namespace symint { + template >> + at::Tensor conv_transpose2d(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, int64_t groups=1, at::IntArrayRef dilation=1) { + return at::_ops::conv_transpose2d_input::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), groups, c10::fromIntArrayRefSlow(dilation)); + } +} + +// aten::conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor +inline at::Tensor conv_transpose2d_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymInt groups=1, c10::SymIntArrayRef dilation=c10::SymInt(1)) { + return at::_ops::conv_transpose2d_input::call(input, weight, bias, stride, padding, output_padding, groups, dilation); +} +namespace symint { + template >> + at::Tensor conv_transpose2d(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymInt groups=1, c10::SymIntArrayRef dilation=c10::SymInt(1)) { + return at::_ops::conv_transpose2d_input::call(input, weight, bias, stride, padding, output_padding, groups, dilation); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d4d2151ff261579f164d7b0c872f587e0e34c5e0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API conv_transpose2d_input { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::conv_transpose2d"; + static constexpr const char* overload_name = "input"; + static constexpr const char* schema_str = "conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymInt groups, c10::SymIntArrayRef dilation); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymInt groups, c10::SymIntArrayRef dilation); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose3d_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose3d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d7f39759ec774528bb7cfc82f2d8d17eafaa692d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose3d_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor conv_transpose3d_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymInt groups=1, c10::SymIntArrayRef dilation=c10::SymInt(1)); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/convolution_overrideable_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/convolution_overrideable_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4be5a335445b01c6a7dafe761006f45b9f1096fc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/convolution_overrideable_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API convolution_overrideable { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, bool, c10::SymIntArrayRef, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::convolution_overrideable"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups); +}; + +struct TORCH_API convolution_overrideable_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, bool, c10::SymIntArrayRef, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::convolution_overrideable"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "convolution_overrideable.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cosh_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cosh_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..283aefb828d046a47e74d46f0efabf553b1b27dd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cosh_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor cosh(const at::Tensor & self); +TORCH_API at::Tensor & cosh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & cosh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & cosh_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/count_nonzero_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/count_nonzero_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f3aa9a7ff1a5670a6a294a8e47f228e8d376d0b3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/count_nonzero_native.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & count_nonzero_dim_IntList_out(const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out); +TORCH_API at::Tensor count_nonzero_cpu(const at::Tensor & self, at::IntArrayRef dim); +TORCH_API at::Tensor count_nonzero_cuda(const at::Tensor & self, at::IntArrayRef dim); +TORCH_API at::Tensor count_nonzero(const at::Tensor & self, ::std::optional dim=::std::nullopt); +TORCH_API at::Tensor & count_nonzero_out(const at::Tensor & self, ::std::optional dim, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cov_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cov_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ead826c68fb4c4c9c4ccd5263316d5ba318c142a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cov_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API cov { + using schema = at::Tensor (const at::Tensor &, int64_t, const ::std::optional &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cov"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t correction, const ::std::optional & fweights, const ::std::optional & aweights); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t correction, const ::std::optional & fweights, const ::std::optional & aweights); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cross_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cross_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fd531409d7e209675ef4df47a7a8eb071adb51bd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cross_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API cross_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cross"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, ::std::optional dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, ::std::optional dim, at::Tensor & out); +}; + +struct TORCH_API cross { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cross"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "cross(Tensor self, Tensor other, int? dim=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other, ::std::optional dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, ::std::optional dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_relu_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_relu_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3a54c0f6c3734615a2acf372d0fc0f62d22feaad --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_relu_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API cudnn_convolution_relu { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cudnn_convolution_relu"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); +}; + +struct TORCH_API cudnn_convolution_relu_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cudnn_convolution_relu"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "cudnn_convolution_relu.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..43302599583ecf763fc64465a9d0f9c689e4b320 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & cudnn_convolution_transpose_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32); +TORCH_API at::Tensor & cudnn_convolution_transpose_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out); +TORCH_API at::Tensor & cudnn_convolution_transpose_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32); +TORCH_API at::Tensor & cudnn_convolution_transpose_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d856e4d1a6ed6f69b6755453de6b39c82f08ad97 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor cudnn_grid_sampler(const at::Tensor & self, const at::Tensor & grid); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..5c31dde88c8cc6ff08ef57849a8b91fa994a07dc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor cumprod_backward(const at::Tensor & grad, const at::Tensor & input, int64_t dim, const at::Tensor & output); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cumsum_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cumsum_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..797bc38de27d58b2993f967170855688dc7283ec --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/cumsum_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor cumsum(const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & cumsum_out(at::Tensor & out, const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & cumsum_outf(const at::Tensor & self, int64_t dim, ::std::optional dtype, at::Tensor & out); +TORCH_API at::Tensor & cumsum_(at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6beba2593ea090940097edc099973de86f429354 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor deg2rad(const at::Tensor & self); +TORCH_API at::Tensor & deg2rad_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & deg2rad_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & deg2rad_(at::Tensor & self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ac9bda3e7365dac016a04475e29a4b5548c8e10d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad_native.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor deg2rad(const at::Tensor & self); +TORCH_API at::Tensor & deg2rad_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & deg2rad_(at::Tensor & self); +TORCH_API at::Tensor deg2rad_sparse(const at::Tensor & self); +TORCH_API at::Tensor & deg2rad_sparse_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & deg2rad_sparse_(at::Tensor & self); +TORCH_API at::Tensor deg2rad_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & deg2rad_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & deg2rad_sparse_csr_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/dense_dim_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/dense_dim_native.h new file mode 100644 index 0000000000000000000000000000000000000000..800e91b31c537c654ddbe8391e8b2a688049d1bd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/dense_dim_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API int64_t dense_dim_default(const at::Tensor & self); +TORCH_API int64_t dense_dim_sparse(const at::Tensor & self); +TORCH_API int64_t dense_dim_sparse_csr(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/detach_copy_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/detach_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0acd2c8da3462a7e359f9c58ab13e33c14f8aea6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/detach_copy_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & detach_copy_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor detach_copy(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/diag_embed.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/diag_embed.h new file mode 100644 index 0000000000000000000000000000000000000000..f6e6ead87426a7e103aa9440c25bd22d81361d07 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/diag_embed.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor +inline at::Tensor diag_embed(const at::Tensor & self, int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1) { + return at::_ops::diag_embed::call(self, offset, dim1, dim2); +} + +// aten::diag_embed.out(Tensor self, int offset=0, int dim1=-2, int dim2=-1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & diag_embed_out(at::Tensor & out, const at::Tensor & self, int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1) { + return at::_ops::diag_embed_out::call(self, offset, dim1, dim2, out); +} +// aten::diag_embed.out(Tensor self, int offset=0, int dim1=-2, int dim2=-1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & diag_embed_outf(const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out) { + return at::_ops::diag_embed_out::call(self, offset, dim1, dim2, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/diagflat_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/diagflat_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d7f089c31b9a7a2bc910787e1e2a11b9f3ce16a6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/diagflat_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor diagflat(const at::Tensor & self, int64_t offset=0); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9d287b0ffa62abb12ab9e9e954626a1b88c4e4f4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor diagonal(const at::Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1); +TORCH_API at::Tensor diagonal(const at::Tensor & self, at::Dimname outdim, at::Dimname dim1, at::Dimname dim2, int64_t offset=0); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/dist_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/dist_native.h new file mode 100644 index 0000000000000000000000000000000000000000..18f35fb315fed3ccb409103b56e218bbf99c1b90 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/dist_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor dist(const at::Tensor & self, const at::Tensor & other, const at::Scalar & p=2); +TORCH_API at::Tensor & dist_out(const at::Tensor & self, const at::Tensor & other, const at::Scalar & p, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/dot.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/dot.h new file mode 100644 index 0000000000000000000000000000000000000000..6ea8c35534d0902371f8f13362fee421c293bca0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/dot.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::dot(Tensor self, Tensor tensor) -> Tensor +inline at::Tensor dot(const at::Tensor & self, const at::Tensor & tensor) { + return at::_ops::dot::call(self, tensor); +} + +// aten::dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & dot_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor) { + return at::_ops::dot_out::call(self, tensor, out); +} +// aten::dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & dot_outf(const at::Tensor & self, const at::Tensor & tensor, at::Tensor & out) { + return at::_ops::dot_out::call(self, tensor, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/elu_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/elu_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a6ea1a9b60c8f126f9c793f0e4f64049c550bd9b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/elu_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor elu(const at::Tensor & self, const at::Scalar & alpha=1, const at::Scalar & scale=1, const at::Scalar & input_scale=1); +TORCH_API at::Tensor & elu_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & alpha=1, const at::Scalar & scale=1, const at::Scalar & input_scale=1); +TORCH_API at::Tensor & elu_outf(const at::Tensor & self, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, at::Tensor & out); +TORCH_API at::Tensor & elu_(at::Tensor & self, const at::Scalar & alpha=1, const at::Scalar & scale=1, const at::Scalar & input_scale=1); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_bag_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_bag_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cd819d0be0b7e71b3ffedf44dfb23aeab1a81a74 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_bag_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API ::std::tuple embedding_bag(const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const ::std::optional & per_sample_weights={}, bool include_last_offset=false); +TORCH_API ::std::tuple embedding_bag(const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const ::std::optional & per_sample_weights, bool include_last_offset, ::std::optional padding_idx); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..21713e5d3a9f9fd9dcd69e96c59494e5f26c2e61 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_compositeexplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor embedding(const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false); +TORCH_API at::Tensor embedding_symint(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false); +TORCH_API at::Tensor & embedding_out(at::Tensor & out, const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false); +TORCH_API at::Tensor & embedding_outf(const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out); +TORCH_API at::Tensor & embedding_symint_out(at::Tensor & out, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false); +TORCH_API at::Tensor & embedding_symint_outf(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3d2b5e43f834809d27a3eac122abc30aab84e672 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor embedding_dense_backward(const at::Tensor & grad_output, const at::Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq); +TORCH_API at::Tensor embedding_dense_backward_symint(const at::Tensor & grad_output, const at::Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..bc92ebcd6a0def6aec7ebbe1ab423c50f147274d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor embedding_dense_backward(const at::Tensor & grad_output, const at::Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq); +TORCH_API at::Tensor embedding_dense_backward_symint(const at::Tensor & grad_output, const at::Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f9d815f19182a50598b9f9bee08feb6948b5ada6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API embedding { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, c10::SymInt, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::embedding"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor"; + static at::Tensor call(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse); +}; + +struct TORCH_API embedding_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, c10::SymInt, bool, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::embedding"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "embedding.out(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/empty_like.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/empty_like.h new file mode 100644 index 0000000000000000000000000000000000000000..7b8982b69d3496f5ddc131277b4b8397f4fef01c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/empty_like.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor empty_like(const at::Tensor & self, at::TensorOptions options={}, ::std::optional memory_format=::std::nullopt) { + return at::_ops::empty_like::call(self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); +} +// aten::empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor empty_like(const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format) { + return at::_ops::empty_like::call(self, dtype, layout, device, pin_memory, memory_format); +} + +// aten::empty_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & empty_like_out(at::Tensor & out, const at::Tensor & self, ::std::optional memory_format=::std::nullopt) { + return at::_ops::empty_like_out::call(self, memory_format, out); +} +// aten::empty_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & empty_like_outf(const at::Tensor & self, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::empty_like_out::call(self, memory_format, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e7a620bbb4388e3bd07d3394374fd62805f5ce84 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/empty_strided_native.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & empty_strided_out_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out); +TORCH_API at::Tensor empty_strided_cpu(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor empty_strided_cuda(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor empty_strided_meta_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor empty_strided_unknown_quantized(at::IntArrayRef size, at::IntArrayRef stride, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/equal_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/equal_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6e3a573c421272df7801a2e543af7258f83f30d5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/equal_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API bool equal(const at::Tensor & self, const at::Tensor & other); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/erfinv_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/erfinv_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..276fa4868e6f9b2e8e314a16a0d9880d5c1124ce --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/erfinv_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_erfinv : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/exp2_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/exp2_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4cbebe66e8780063a26bc3a24adc53dde7e7eac9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/exp2_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor exp2(const at::Tensor & self); +TORCH_API at::Tensor & exp2_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & exp2_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & exp2_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/exp_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/exp_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ce09b1be18ae3e7d45413c0b99613a333a1b5549 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/exp_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor exp(const at::Tensor & self); +TORCH_API at::Tensor & exp_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & exp_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & exp_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/exp_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/exp_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..af60ecafab90e9221ecc1bcf37d99840026c7cdb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/exp_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor exp(const at::Tensor & self); +TORCH_API at::Tensor & exp_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & exp_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & exp_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/expand_copy_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/expand_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..581df8040dd9513700019cee8f345edd61652336 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/expand_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API expand_copy { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::expand_copy"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor"; + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef size, bool implicit); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, bool implicit); +}; + +struct TORCH_API expand_copy_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::expand_copy"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "expand_copy.out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef size, bool implicit, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, bool implicit, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/expm1_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/expm1_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0964acdd234bfb461002aa5e2a762c87aea08e35 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/expm1_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor expm1(const at::Tensor & self); +TORCH_API at::Tensor & expm1_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & expm1_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & expm1_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/eye_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/eye_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..5c1b7766cbef20c36b7d84532c8eb3ca2d2e9ba2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/eye_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API eye { + using schema = at::Tensor (c10::SymInt, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::eye"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "eye(SymInt n, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(c10::SymInt n, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API eye_m { + using schema = at::Tensor (c10::SymInt, c10::SymInt, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::eye"; + static constexpr const char* overload_name = "m"; + static constexpr const char* schema_str = "eye.m(SymInt n, SymInt m, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(c10::SymInt n, c10::SymInt m, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, c10::SymInt m, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API eye_out { + using schema = at::Tensor & (c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::eye"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "eye.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(c10::SymInt n, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, at::Tensor & out); +}; + +struct TORCH_API eye_m_out { + using schema = at::Tensor & (c10::SymInt, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::eye"; + static constexpr const char* overload_name = "m_out"; + static constexpr const char* schema_str = "eye.m_out(SymInt n, SymInt m, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(c10::SymInt n, c10::SymInt m, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymInt n, c10::SymInt m, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ff4058232fb05b34ede10e0fab94fe5fd9eddf26 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fake_quantize_per_tensor_affine_cachemask_backward(const at::Tensor & grad, const at::Tensor & mask); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8d2f3ddfb40da53685803a233d3f4dedc119694a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fake_quantize_per_tensor_affine_cachemask_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fake_quantize_per_tensor_affine_cachemask_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor"; + static at::Tensor call(const at::Tensor & grad, const at::Tensor & mask); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & mask); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_native.h new file mode 100644 index 0000000000000000000000000000000000000000..28d81d24da0e4de38ed313f2557adeccb4cfccc5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor fake_quantize_per_tensor_affine(const at::Tensor & self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max); +TORCH_API at::Tensor fake_quantize_per_tensor_affine(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_fp16_weight_fp32_activation_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_fp16_weight_fp32_activation_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..86e2fd43781caf736111ac9d4d0fd8a6e7003cb6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_fp16_weight_fp32_activation_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fbgemm_linear_fp16_weight_fp32_activation(const at::Tensor & input, const at::Tensor & packed_weight, const ::std::optional & bias); +TORCH_API at::Tensor fbgemm_linear_fp16_weight_fp32_activation(const at::Tensor & input, const at::Tensor & packed_weight, const ::std::optional & bias, at::Tensor & output); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c2ad51544fb2f90b36625760d19445bf7bfc6552 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fbgemm_linear_int8_weight(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & packed, const at::Tensor & col_offsets, const at::Scalar & weight_scale, const at::Scalar & weight_zero_point, const at::Tensor & bias); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_fp32_activation.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_fp32_activation.h new file mode 100644 index 0000000000000000000000000000000000000000..3c47277ef771aef0358daa7b496992f26be68f83 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_fp32_activation.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor +inline at::Tensor fbgemm_linear_int8_weight_fp32_activation(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & packed, const at::Tensor & col_offsets, const at::Scalar & weight_scale, const at::Scalar & weight_zero_point, const at::Tensor & bias) { + return at::_ops::fbgemm_linear_int8_weight_fp32_activation::call(input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_pack_gemm_matrix_fp16_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_pack_gemm_matrix_fp16_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d948c208033b013bc7e524a59e6b280b588d12b2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_pack_gemm_matrix_fp16_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fbgemm_pack_gemm_matrix_fp16 { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fbgemm_pack_gemm_matrix_fp16"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor"; + static at::Tensor call(const at::Tensor & input); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/feature_dropout_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/feature_dropout_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..958d13e61d6650d02a6df5cf68e62b46c852ef19 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/feature_dropout_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API feature_dropout { + using schema = at::Tensor (const at::Tensor &, double, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::feature_dropout"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "feature_dropout(Tensor input, float p, bool train) -> Tensor"; + static at::Tensor call(const at::Tensor & input, double p, bool train); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double p, bool train); +}; + +struct TORCH_API feature_dropout_ { + using schema = at::Tensor & (at::Tensor &, double, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::feature_dropout_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, double p, bool train); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double p, bool train); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft2_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft2_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c73c50b315a4081638a84b92913dec1ea184cb9d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft2_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor fft_fft2_symint(const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_fft2_symint_out(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fftn_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fftn_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..06b58feb62e0038453706aea1091383f77f3b9c3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fftn_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fft_fftn { + using schema = at::Tensor (const at::Tensor &, at::OptionalSymIntArrayRef, at::OptionalIntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fft_fftn"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fft_fftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, ::std::optional norm); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, ::std::optional norm); +}; + +struct TORCH_API fft_fftn_out { + using schema = at::Tensor & (const at::Tensor &, at::OptionalSymIntArrayRef, at::OptionalIntArrayRef, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fft_fftn"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "fft_fftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, ::std::optional norm, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, ::std::optional norm, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fftshift_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fftshift_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ff6019ee718a61dbc339696f9eb50548a853740a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fftshift_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor fft_fftshift(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ihfft.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ihfft.h new file mode 100644 index 0000000000000000000000000000000000000000..345381b77a6610c26760398ff51de5ceb4d0fc9d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ihfft.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor +inline at::Tensor fft_ihfft(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_ihfft::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm); +} +namespace symint { + template >> + at::Tensor fft_ihfft(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_ihfft::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm); + } +} + +// aten::fft_ihfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor +inline at::Tensor fft_ihfft_symint(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_ihfft::call(self, n, dim, norm); +} +namespace symint { + template >> + at::Tensor fft_ihfft(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_ihfft::call(self, n, dim, norm); + } +} + +// aten::fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_ihfft_out(at::Tensor & out, const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_ihfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_ihfft_out(at::Tensor & out, const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_ihfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out); + } +} + +// aten::fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_ihfft_outf(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_ihfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_ihfft_outf(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_ihfft_out::call(self, n.has_value() ? ::std::make_optional(c10::SymInt(*n)) : ::std::nullopt, dim, norm, out); + } +} + +// aten::fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_ihfft_symint_out(at::Tensor & out, const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_ihfft_out::call(self, n, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_ihfft_out(at::Tensor & out, const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_ihfft_out::call(self, n, dim, norm, out); + } +} + +// aten::fft_ihfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_ihfft_symint_outf(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_ihfft_out::call(self, n, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_ihfft_outf(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_ihfft_out::call(self, n, dim, norm, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ihfft_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ihfft_native.h new file mode 100644 index 0000000000000000000000000000000000000000..687eb87157d946e6002d479e7c73ff81eed39d48 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ihfft_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor fft_ihfft_symint(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_ihfft_symint_out(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e03c4034fc8e6c0a155d4ba3ee9aef3430d01122 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fft_irfft2(const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor fft_irfft2_symint(const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_irfft2_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_irfft2_outf(const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); +TORCH_API at::Tensor & fft_irfft2_symint_out(at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_irfft2_symint_outf(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fill_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fill_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9ff141d8192bbd0d7afc50ea6149896cc4547d99 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fill_compositeexplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor fill(const at::Tensor & self, const at::Scalar & value); +TORCH_API at::Tensor & fill_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & value); +TORCH_API at::Tensor & fill_outf(const at::Tensor & self, const at::Scalar & value, at::Tensor & out); +TORCH_API at::Tensor fill(const at::Tensor & self, const at::Tensor & value); +TORCH_API at::Tensor & fill_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & value); +TORCH_API at::Tensor & fill_outf(const at::Tensor & self, const at::Tensor & value, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fill_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fill_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2a178007ae0bef0a0d7d4fe86c03ee38b4570cfc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fill_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fill_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "fill.Scalar(Tensor self, Scalar value) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & value); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & value); +}; + +struct TORCH_API fill_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "fill.Tensor(Tensor self, Tensor value) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & value); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & value); +}; + +struct TORCH_API fill__Scalar { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill_"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & value); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & value); +}; + +struct TORCH_API fill__Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill_"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & value); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & value); +}; + +struct TORCH_API fill_Scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill"; + static constexpr const char* overload_name = "Scalar_out"; + static constexpr const char* schema_str = "fill.Scalar_out(Tensor self, Scalar value, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & value, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & value, at::Tensor & out); +}; + +struct TORCH_API fill_Tensor_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill"; + static constexpr const char* overload_name = "Tensor_out"; + static constexpr const char* schema_str = "fill.Tensor_out(Tensor self, Tensor value, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & value, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & value, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fix.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fix.h new file mode 100644 index 0000000000000000000000000000000000000000..3c917af164bc17888292e56ce03af3b138657cef --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fix.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fix(Tensor self) -> Tensor +inline at::Tensor fix(const at::Tensor & self) { + return at::_ops::fix::call(self); +} + +// aten::fix_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & fix_(at::Tensor & self) { + return at::_ops::fix_::call(self); +} + +// aten::fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fix_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::fix_out::call(self, out); +} +// aten::fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fix_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::fix_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/flatten.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/flatten.h new file mode 100644 index 0000000000000000000000000000000000000000..d89b63e12551068056994cec53f6a5db909d90ba --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/flatten.h @@ -0,0 +1,51 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) +inline at::Tensor flatten(const at::Tensor & self, int64_t start_dim=0, int64_t end_dim=-1) { + return at::_ops::flatten_using_ints::call(self, start_dim, end_dim); +} + +// aten::flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) +inline at::Tensor flatten(const at::Tensor & self, int64_t start_dim, int64_t end_dim, at::Dimname out_dim) { + return at::_ops::flatten_named_out_dim::call(self, start_dim, end_dim, out_dim); +} + +// aten::flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a) +inline at::Tensor flatten(const at::Tensor & self, at::Dimname start_dim, at::Dimname end_dim, at::Dimname out_dim) { + return at::_ops::flatten_using_names::call(self, start_dim, end_dim, out_dim); +} + +// aten::flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) +inline at::Tensor flatten(const at::Tensor & self, at::DimnameList dims, at::Dimname out_dim) { + return at::_ops::flatten_DimnameList::call(self, dims, out_dim); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fliplr_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fliplr_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..9996687ec8659611de6293a87efed24bace6d0ac --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fliplr_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fliplr { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fliplr"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fliplr(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/flipud.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/flipud.h new file mode 100644 index 0000000000000000000000000000000000000000..69b8fe8be6a5f9937f0b386e65c0da324c079e25 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/flipud.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::flipud(Tensor self) -> Tensor +inline at::Tensor flipud(const at::Tensor & self) { + return at::_ops::flipud::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/floor_divide_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/floor_divide_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c0220f4f51ebac28c1f4182e807a0ecbe2a8e7da --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/floor_divide_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API floor_divide { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::floor_divide"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "floor_divide(Tensor self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API floor_divide__Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::floor_divide_"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & other); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API floor_divide_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::floor_divide"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +}; + +struct TORCH_API floor_divide_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::floor_divide"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "floor_divide.Scalar(Tensor self, Scalar other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other); +}; + +struct TORCH_API floor_divide__Scalar { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::floor_divide_"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & other); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other); +}; + +struct TORCH_API floor_divide_Scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::floor_divide"; + static constexpr const char* overload_name = "Scalar_out"; + static constexpr const char* schema_str = "floor_divide.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/floor_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/floor_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..85eac190169e8af81718122ae87133f354fcbd1c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/floor_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_floor : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fmin_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fmin_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ed6300a0310481d6ac8ad158d95eece9966416b6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fmin_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor fmin(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & fmin_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & fmin_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/frac_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/frac_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a08bed99a125b47ad617c45d1a24aed93db85366 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/frac_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor frac(const at::Tensor & self); +TORCH_API at::Tensor & frac_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & frac_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & frac_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2c6c37bb8c172d61fbf62b81dfcb569917dab01f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple fractional_max_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples); +TORCH_API ::std::tuple fractional_max_pool2d_out(at::Tensor & output, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples); +TORCH_API ::std::tuple fractional_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples, at::Tensor & output, at::Tensor & indices); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool3d_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool3d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..b4e5766304ee387cbd823d5bfd42b162de8c8250 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool3d_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fractional_max_pool3d_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fractional_max_pool3d_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "fractional_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices, at::Tensor & grad_input); +}; + +struct TORCH_API fractional_max_pool3d_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fractional_max_pool3d_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/frexp.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/frexp.h new file mode 100644 index 0000000000000000000000000000000000000000..22b55f210854605a04cef310f1a2909da9e076fb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/frexp.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent) +inline ::std::tuple frexp(const at::Tensor & self) { + return at::_ops::frexp_Tensor::call(self); +} + +// aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent) +inline ::std::tuple frexp_out(at::Tensor & mantissa, at::Tensor & exponent, const at::Tensor & self) { + return at::_ops::frexp_Tensor_out::call(self, mantissa, exponent); +} +// aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent) +inline ::std::tuple frexp_outf(const at::Tensor & self, at::Tensor & mantissa, at::Tensor & exponent) { + return at::_ops::frexp_Tensor_out::call(self, mantissa, exponent); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/frexp_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/frexp_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..34605083b590d1af48cf60e14495ae282daa2f35 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/frexp_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple frexp_out(at::Tensor & mantissa, at::Tensor & exponent, const at::Tensor & self); +TORCH_API ::std::tuple frexp_outf(const at::Tensor & self, at::Tensor & mantissa, at::Tensor & exponent); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/full_like_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/full_like_native.h new file mode 100644 index 0000000000000000000000000000000000000000..27bda7264c156137ec6b5bf476f0e08846677145 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/full_like_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor full_like(const at::Tensor & self, const at::Scalar & fill_value, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor & full_like_out(const at::Tensor & self, const at::Scalar & fill_value, ::std::optional memory_format, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/full_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/full_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3e8ff00a112fb9bf29fb4cb357ca9dd4936f2df3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/full_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API full_names { + using schema = at::Tensor (at::IntArrayRef, const at::Scalar &, ::std::optional, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::full"; + static constexpr const char* overload_name = "names"; + static constexpr const char* schema_str = "full.names(int[] size, Scalar fill_value, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(at::IntArrayRef size, const at::Scalar & fill_value, ::std::optional names, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Scalar & fill_value, ::std::optional names, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API full { + using schema = at::Tensor (c10::SymIntArrayRef, const at::Scalar &, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::full"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(c10::SymIntArrayRef size, const at::Scalar & fill_value, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, const at::Scalar & fill_value, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API full_out { + using schema = at::Tensor & (c10::SymIntArrayRef, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::full"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "full.out(SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(c10::SymIntArrayRef size, const at::Scalar & fill_value, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::Tensor & out); +}; + +struct TORCH_API full_names_out { + using schema = at::Tensor & (at::IntArrayRef, const at::Scalar &, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::full"; + static constexpr const char* overload_name = "names_out"; + static constexpr const char* schema_str = "full.names_out(int[] size, Scalar fill_value, *, Dimname[]? names, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(at::IntArrayRef size, const at::Scalar & fill_value, ::std::optional names, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Scalar & fill_value, ::std::optional names, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gather_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gather_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2938c5e62e0737ec7e0c9dcf46c57c8ad1af73ac --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gather_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor gather(const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gather_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gather_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..3eff09091d9093cb254b7f68c2ff42fa39caf47f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gather_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_gather : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4fe0b341091ae1fa0cd1292458e411b2b9da9436 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor gcd(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & gcd_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & gcd_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & gcd_(at::Tensor & self, const at::Tensor & other); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ge_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ge_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..06ff45e596f489f0d344b239a01be0aa6f4f5808 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ge_meta.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_ge_Scalar : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Scalar & other); +}; +struct TORCH_API structured_ge_Tensor : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..d05a76c3b5c04dd8fc85eabf11f6108441738663 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & gelu_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none") { + return at::_ops::gelu_backward_grad_input::call(grad_output, self, approximate, grad_input); +} +// aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & gelu_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate, at::Tensor & grad_input) { + return at::_ops::gelu_backward_grad_input::call(grad_output, self, approximate, grad_input); +} + +// aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor +inline at::Tensor gelu_backward(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none") { + return at::_ops::gelu_backward::call(grad_output, self, approximate); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f6139ff2355fc9f04afb4810ee57de5ab542cae1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor gelu(const at::Tensor & self, c10::string_view approximate="none"); +TORCH_API at::Tensor & gelu_out(at::Tensor & out, const at::Tensor & self, c10::string_view approximate="none"); +TORCH_API at::Tensor & gelu_outf(const at::Tensor & self, c10::string_view approximate, at::Tensor & out); +TORCH_API at::Tensor & gelu_(at::Tensor & self, c10::string_view approximate="none"); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f6c6b347793670678fd73a7d6a1b6623465d714f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor gelu(const at::Tensor & self, c10::string_view approximate="none"); +TORCH_API at::Tensor & gelu_out(at::Tensor & out, const at::Tensor & self, c10::string_view approximate="none"); +TORCH_API at::Tensor & gelu_outf(const at::Tensor & self, c10::string_view approximate, at::Tensor & out); +TORCH_API at::Tensor & gelu_(at::Tensor & self, c10::string_view approximate="none"); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/glu_backward_jvp.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/glu_backward_jvp.h new file mode 100644 index 0000000000000000000000000000000000000000..70862c6182ef346df544f28c5446df64105bc1ad --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/glu_backward_jvp.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor +inline at::Tensor glu_backward_jvp(const at::Tensor & grad_x, const at::Tensor & grad_glu, const at::Tensor & x, const at::Tensor & dgrad_glu, const at::Tensor & dx, int64_t dim) { + return at::_ops::glu_backward_jvp::call(grad_x, grad_glu, x, dgrad_glu, dx, dim); +} + +// aten::glu_backward_jvp.out(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & glu_backward_jvp_out(at::Tensor & out, const at::Tensor & grad_x, const at::Tensor & grad_glu, const at::Tensor & x, const at::Tensor & dgrad_glu, const at::Tensor & dx, int64_t dim) { + return at::_ops::glu_backward_jvp_out::call(grad_x, grad_glu, x, dgrad_glu, dx, dim, out); +} +// aten::glu_backward_jvp.out(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & glu_backward_jvp_outf(const at::Tensor & grad_x, const at::Tensor & grad_glu, const at::Tensor & x, const at::Tensor & dgrad_glu, const at::Tensor & dx, int64_t dim, at::Tensor & out) { + return at::_ops::glu_backward_jvp_out::call(grad_x, grad_glu, x, dgrad_glu, dx, dim, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/glu_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/glu_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..dd74bb73e29bf7b051c2bb30b7cf4531c0bc571b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/glu_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor glu(const at::Tensor & self, int64_t dim=-1); +TORCH_API at::Tensor & glu_out(at::Tensor & out, const at::Tensor & self, int64_t dim=-1); +TORCH_API at::Tensor & glu_outf(const at::Tensor & self, int64_t dim, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/glu_jvp_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/glu_jvp_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9a9f41651f87ea1220457197a400d69fe5e53bbd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/glu_jvp_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & glu_jvp_out(const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim, at::Tensor & out); +TORCH_API at::Tensor glu_jvp(const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/greater_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/greater_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..b1c4de7dfcee9f1485c9250491919bd048ec7110 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/greater_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API greater_Scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater"; + static constexpr const char* overload_name = "Scalar_out"; + static constexpr const char* schema_str = "greater.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +}; + +struct TORCH_API greater_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "greater.Scalar(Tensor self, Scalar other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other); +}; + +struct TORCH_API greater_Tensor_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater"; + static constexpr const char* overload_name = "Tensor_out"; + static constexpr const char* schema_str = "greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +}; + +struct TORCH_API greater_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "greater.Tensor(Tensor self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API greater__Scalar { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater_"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & other); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other); +}; + +struct TORCH_API greater__Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::greater_"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & other); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1ffda6d63ff0543a0c33801f1163e44be2f0d70e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple grid_sampler_2d_backward_out(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1); +TORCH_API ::std::tuple grid_sampler_2d_backward_cpu(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask); +TORCH_API ::std::tuple grid_sampler_2d_backward_cuda(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..a737519c3ff3a27f5511b2882871fd70c0979561 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & grid_sampler_2d_out(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, at::Tensor & out); +TORCH_API at::Tensor grid_sampler_2d_cpu(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); +TORCH_API at::Tensor grid_sampler_2d_cuda(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_3d_backward_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_3d_backward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6e416e7b0be2c0ee4edf360d857267c79c491df3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_3d_backward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple grid_sampler_3d_backward_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask); +TORCH_API ::std::tuple grid_sampler_3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b6432269a09d1def017226ac14fd0739c05aa144 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor grid_sampler(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hann_window_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hann_window_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a74133b6c9e77f4db826eda47080feb6c1076c7c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hann_window_compositeexplicitautograd_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor hann_window(int64_t window_length, at::TensorOptions options={}); +TORCH_API at::Tensor hann_window(int64_t window_length, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor & hann_window_out(at::Tensor & out, int64_t window_length); +TORCH_API at::Tensor & hann_window_outf(int64_t window_length, at::Tensor & out); +TORCH_API at::Tensor hann_window(int64_t window_length, bool periodic, at::TensorOptions options={}); +TORCH_API at::Tensor hann_window(int64_t window_length, bool periodic, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor & hann_window_out(at::Tensor & out, int64_t window_length, bool periodic); +TORCH_API at::Tensor & hann_window_outf(int64_t window_length, bool periodic, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardshrink_backward_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardshrink_backward_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6dc597d85aa87e01bfd55aae23e4414d44d6ca24 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardshrink_backward_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor hardshrink_backward(const at::Tensor & grad_out, const at::Tensor & self, const at::Scalar & lambd); +TORCH_API at::Tensor & hardshrink_backward_out(at::Tensor & grad_input, const at::Tensor & grad_out, const at::Tensor & self, const at::Scalar & lambd); +TORCH_API at::Tensor & hardshrink_backward_outf(const at::Tensor & grad_out, const at::Tensor & self, const at::Scalar & lambd, at::Tensor & grad_input); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardshrink_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardshrink_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..645bc0425c92b658e7a8b5e89b7ccb5d060e7c0b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardshrink_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_hardshrink_backward_out : public at::meta::structured_hardshrink_backward { +void impl(const at::Tensor & grad_out, const at::Tensor & self, const at::Scalar & lambd, const at::Tensor & grad_input); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..37d273dc8cdbac5af6dc0b8c9c531597028d1e9d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & hardsigmoid_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::hardsigmoid_backward_grad_input::call(grad_output, self, grad_input); +} +// aten::hardsigmoid_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & hardsigmoid_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input) { + return at::_ops::hardsigmoid_backward_grad_input::call(grad_output, self, grad_input); +} + +// aten::hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor +inline at::Tensor hardsigmoid_backward(const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::hardsigmoid_backward::call(grad_output, self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_backward_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a52f8a80932e8b70c47e75c5bac73971a385bc7e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_backward_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor hardswish_backward(const at::Tensor & grad_output, const at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..892ece6f84ae8ca9f80e528372bb932cbfdb66c7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor hardswish(const at::Tensor & self); +TORCH_API at::Tensor & hardswish_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & hardswish_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & hardswish_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..a7709ef73430311dc50a236a864e7c249821f600 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & hardtanh_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val) { + return at::_ops::hardtanh_backward_grad_input::call(grad_output, self, min_val, max_val, grad_input); +} +// aten::hardtanh_backward.grad_input(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & hardtanh_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val, at::Tensor & grad_input) { + return at::_ops::hardtanh_backward_grad_input::call(grad_output, self, min_val, max_val, grad_input); +} + +// aten::hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor +inline at::Tensor hardtanh_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val) { + return at::_ops::hardtanh_backward::call(grad_output, self, min_val, max_val); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c4d116ca839f93dbcbca6193c45de26de95c21d8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor hardtanh_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val); +TORCH_API at::Tensor & hardtanh_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val); +TORCH_API at::Tensor & hardtanh_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f0cdc655726ab4e6f6fd107752870b2697951674 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor hardtanh_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val); +TORCH_API at::Tensor & hardtanh_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val); +TORCH_API at::Tensor & hardtanh_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..144e59c679abdd10096eb55ce08399d902953a4d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor hash_tensor(const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false, int64_t mode=0); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..873b9fac9768964a69f78507bf95a35e731fa47c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_hash_tensor_out : public at::meta::structured_hash_tensor { +void impl(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, int64_t mode, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hsplit_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hsplit_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2a209e5d64c4e3afb8afe164dd2f6e2e2dc87292 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/hsplit_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API hsplit_int { + using schema = ::std::vector (const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::hsplit"; + static constexpr const char* overload_name = "int"; + static constexpr const char* schema_str = "hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]"; + static ::std::vector call(const at::Tensor & self, int64_t sections); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sections); +}; + +struct TORCH_API hsplit_array { + using schema = ::std::vector (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::hsplit"; + static constexpr const char* overload_name = "array"; + static constexpr const char* schema_str = "hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]"; + static ::std::vector call(const at::Tensor & self, at::IntArrayRef indices); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef indices); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/im2col_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/im2col_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4f2255e5e4b9d7c19dde5af1e90dab15048e4001 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/im2col_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor im2col(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride); +TORCH_API at::Tensor & im2col_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride); +TORCH_API at::Tensor & im2col_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8e3b767548c213164d1d4da355fd35570ee11146 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor index_add(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1); +TORCH_API at::Tensor & index_add_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1); +TORCH_API at::Tensor & index_add_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & index_add_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..e94757084730f1d57cf366f293fcd0d40a91c365 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_meta.h @@ -0,0 +1,44 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_index_add : public at::impl::MetaBase { + + template + struct TORCH_API precompute_out { + + precompute_out set_dim(int64_t value) { + static_assert(DIM == false, "dim already set"); + precompute_out ret; +ret.dim = value; +return ret; + } + + int64_t dim; + }; + using meta_return_ty = precompute_out ; + meta_return_ty meta(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/indices_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/indices_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0022a75353d1822f26d4f6d4b7ee2d6e14949aa2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/indices_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor indices(const at::Tensor & self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/infinitely_differentiable_gelu_backward_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/infinitely_differentiable_gelu_backward_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b24561cf68aa63638f9835d85fafa8229255e4e5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/infinitely_differentiable_gelu_backward_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor infinitely_differentiable_gelu_backward(const at::Tensor & grad, const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/instance_norm.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/instance_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..a5cd9056522b1cd97c04df9979e441d014ba1b92 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/instance_norm.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::instance_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool use_input_stats, float momentum, float eps, bool cudnn_enabled) -> Tensor +inline at::Tensor instance_norm(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, bool use_input_stats, double momentum, double eps, bool cudnn_enabled) { + return at::_ops::instance_norm::call(input, weight, bias, running_mean, running_var, use_input_stats, momentum, eps, cudnn_enabled); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/inverse_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/inverse_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0c2790b1388072cc79512fd3a2fc204963f3226d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/inverse_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor inverse(const at::Tensor & self); +TORCH_API at::Tensor & inverse_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & inverse_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0e6ca3c2824ed95472376cc405c81c6a26590954 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API bool is_inference(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8759c7f6b87405f3bcc8595581e327f84539d9b0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API is_inference { + using schema = bool (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::is_inference"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "is_inference(Tensor self) -> bool"; + static bool call(const at::Tensor & self); + static bool redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_leaf_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_leaf_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1bcb22e66f57d9c718f1a05947e519884c222148 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_leaf_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API is_leaf { + using schema = bool (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::is_leaf"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "is_leaf(Tensor self) -> bool"; + static bool call(const at::Tensor & self); + static bool redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_neg_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_neg_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..140ab526c5a773e16579cb29b91e7b765c96c156 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/is_neg_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API bool is_neg(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isfinite_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isfinite_native.h new file mode 100644 index 0000000000000000000000000000000000000000..318a706b2b09cc58386b9a950b7fbbdf2a1ba194 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isfinite_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor isfinite(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isin_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isin_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e0b8bb6b7509189ba7cfeee57ae615b76a43352d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isin_native.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_isin_Tensor_Tensor_out : public at::meta::structured_isin_Tensor_Tensor { +void impl(const at::Tensor & elements, const at::Tensor & test_elements, bool assume_unique, bool invert, const at::Tensor & out); +}; +struct TORCH_API structured_isin_Tensor_Scalar_out : public at::meta::structured_isin_Tensor_Scalar { +void impl(const at::Tensor & elements, const at::Scalar & test_element, bool assume_unique, bool invert, const at::Tensor & out); +}; +struct TORCH_API structured_isin_Scalar_Tensor_out : public at::meta::structured_isin_Scalar_Tensor { +void impl(const at::Scalar & element, const at::Tensor & test_elements, bool assume_unique, bool invert, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isnan_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isnan_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e4559a5168088fc04e8d2d5cd151a393408e288c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isnan_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor isnan(const at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isnan_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isnan_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..02f17f66c851dc2070e21109644f831eff134f27 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isnan_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API isnan { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::isnan"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "isnan(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API isnan_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::isnan"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "isnan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c15158f09213113bed4a3d760ec562413210f4b2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor isneginf(const at::Tensor & self); +TORCH_API at::Tensor & isneginf_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & isneginf_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ee502e46a50bf9eba4efebd7bcf2d78064bf5858 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API isneginf { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::isneginf"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "isneginf(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API isneginf_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::isneginf"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isreal_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isreal_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..14951b5ccbe251aa7c99244b6a7d89c8fe3d7e10 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/isreal_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor isreal(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/l1_loss_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/l1_loss_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7d62afce6896ccb4e354e8f04f5d91f5c34791fe --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/l1_loss_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor l1_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ldexp_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ldexp_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..40914e8661c887f8a5f8dab0104efb5d2ead776f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ldexp_compositeimplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor ldexp(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & ldexp_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & ldexp_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & ldexp_(at::Tensor & self, const at::Tensor & other); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6a2d604df85df4dfdea2c169a76a7d067dd0a18e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_cuda_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor le(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & le_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & le_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +TORCH_API at::Tensor & le_(at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor le(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & le_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & le_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & le_(at::Tensor & self, const at::Tensor & other); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..06ddf8a83cc953355e7852d41f87f5c36b4a29c3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_meta.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_le_Scalar : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Scalar & other); +}; +struct TORCH_API structured_le_Tensor : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..abf361c208c9a56c339c3bb8f6a3b6f1bd23e3eb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_meta_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor le(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & le_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & le_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +TORCH_API at::Tensor & le_(at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor le(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & le_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & le_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & le_(at::Tensor & self, const at::Tensor & other); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e9bdccfdd8a305448ced0e21165ecc054110d857 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/le_native.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_le_Scalar_out : public at::meta::structured_le_Scalar { +void impl(const at::Tensor & self, const at::Scalar & other, const at::Tensor & out); +}; +TORCH_API at::Tensor le_quantized_cpu(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & le_out_quantized_cpu(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +struct TORCH_API structured_le_Tensor_out : public at::meta::structured_le_Tensor { +void impl(const at::Tensor & self, const at::Tensor & other, const at::Tensor & out); +}; +TORCH_API at::Tensor le_quantized_cpu(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & le_out_quantized_cpu(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..615472abf4f06e950af709e27510fcd90a2d2471 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor leaky_relu(const at::Tensor & self, const at::Scalar & negative_slope=0.01); +TORCH_API at::Tensor & leaky_relu_(at::Tensor & self, const at::Scalar & negative_slope=0.01); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/less_equal_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/less_equal_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..609c300aee040daea066866817d71e990bdce7db --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/less_equal_compositeimplicitautograd_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor less_equal(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & less_equal_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & less_equal_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +TORCH_API at::Tensor & less_equal_(at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor less_equal(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & less_equal_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & less_equal_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & less_equal_(at::Tensor & self, const at::Tensor & other); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/lift_fresh_copy_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/lift_fresh_copy_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..25429a732d69d19811a4a8cdf0fa104ff4fe5ddf --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/lift_fresh_copy_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & lift_fresh_copy_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & lift_fresh_copy_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c559fdf200341af4c473aaa9399a78de2addd9bb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple linalg_cholesky_ex(const at::Tensor & self, bool upper=false, bool check_errors=false); +TORCH_API ::std::tuple linalg_cholesky_ex_out(at::Tensor & L, at::Tensor & info, const at::Tensor & self, bool upper=false, bool check_errors=false); +TORCH_API ::std::tuple linalg_cholesky_ex_outf(const at::Tensor & self, bool upper, bool check_errors, at::Tensor & L, at::Tensor & info); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cfe744fccf8180c4aab90e5d164efff1a9c08999 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API ::std::tuple linalg_cholesky_ex(const at::Tensor & self, bool upper=false, bool check_errors=false); +TORCH_API ::std::tuple linalg_cholesky_ex_out(at::Tensor & L, at::Tensor & info, const at::Tensor & self, bool upper=false, bool check_errors=false); +TORCH_API ::std::tuple linalg_cholesky_ex_outf(const at::Tensor & self, bool upper, bool check_errors, at::Tensor & L, at::Tensor & info); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..4f3368dff66b454c7fdfa868b723e9fc6add9fbc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_linalg_cross : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, const at::Tensor & other, int64_t dim); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eig_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eig_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7ecfc5673ab65e45818701bca883081416f2d526 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eig_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API linalg_eig { + using schema = ::std::tuple (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_eig"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "linalg_eig(Tensor self) -> (Tensor eigenvalues, Tensor eigenvectors)"; + static ::std::tuple call(const at::Tensor & self); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API linalg_eig_out { + using schema = ::std::tuple (const at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_eig"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "linalg_eig.out(Tensor self, *, Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)"; + static ::std::tuple call(const at::Tensor & self, at::Tensor & eigenvalues, at::Tensor & eigenvectors); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & eigenvalues, at::Tensor & eigenvectors); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh.h new file mode 100644 index 0000000000000000000000000000000000000000..ba5bcb116dd51221203fcc67e73b135aa83768e5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) +inline ::std::tuple linalg_eigh(const at::Tensor & self, c10::string_view UPLO="L") { + return at::_ops::linalg_eigh::call(self, UPLO); +} + +// aten::linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) +inline ::std::tuple linalg_eigh_out(at::Tensor & eigvals, at::Tensor & eigvecs, const at::Tensor & self, c10::string_view UPLO="L") { + return at::_ops::linalg_eigh_eigvals::call(self, UPLO, eigvals, eigvecs); +} +// aten::linalg_eigh.eigvals(Tensor self, str UPLO="L", *, Tensor(a!) eigvals, Tensor(b!) eigvecs) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors) +inline ::std::tuple linalg_eigh_outf(const at::Tensor & self, c10::string_view UPLO, at::Tensor & eigvals, at::Tensor & eigvecs) { + return at::_ops::linalg_eigh_eigvals::call(self, UPLO, eigvals, eigvecs); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0755332f4fc6cadebbdf61638619584d603f7400 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple linalg_eigh(const at::Tensor & self, c10::string_view UPLO="L"); +TORCH_API ::std::tuple linalg_eigh_out(const at::Tensor & self, c10::string_view UPLO, at::Tensor & eigvals, at::Tensor & eigvecs); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_householder_product.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_householder_product.h new file mode 100644 index 0000000000000000000000000000000000000000..2ffe202c65fe4b2bc0b5ddbcf610fda9d6f5d76f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_householder_product.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_householder_product(Tensor input, Tensor tau) -> Tensor +inline at::Tensor linalg_householder_product(const at::Tensor & input, const at::Tensor & tau) { + return at::_ops::linalg_householder_product::call(input, tau); +} + +// aten::linalg_householder_product.out(Tensor input, Tensor tau, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_householder_product_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & tau) { + return at::_ops::linalg_householder_product_out::call(input, tau, out); +} +// aten::linalg_householder_product.out(Tensor input, Tensor tau, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_householder_product_outf(const at::Tensor & input, const at::Tensor & tau, at::Tensor & out) { + return at::_ops::linalg_householder_product_out::call(input, tau, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_householder_product_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_householder_product_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d830b2033bc422b77c72b8ea7900c08ee48a80c4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_householder_product_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor linalg_householder_product(const at::Tensor & input, const at::Tensor & tau); +TORCH_API at::Tensor & linalg_householder_product_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & tau); +TORCH_API at::Tensor & linalg_householder_product_outf(const at::Tensor & input, const at::Tensor & tau, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ex.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ex.h new file mode 100644 index 0000000000000000000000000000000000000000..d2cacef7c42eabe4b91aa5491259968dff797c56 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ex.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info) +inline ::std::tuple linalg_inv_ex(const at::Tensor & A, bool check_errors=false) { + return at::_ops::linalg_inv_ex::call(A, check_errors); +} + +// aten::linalg_inv_ex.inverse(Tensor A, *, bool check_errors=False, Tensor(a!) inverse, Tensor(b!) info) -> (Tensor(a!) inverse, Tensor(b!) info) +inline ::std::tuple linalg_inv_ex_out(at::Tensor & inverse, at::Tensor & info, const at::Tensor & A, bool check_errors=false) { + return at::_ops::linalg_inv_ex_inverse::call(A, check_errors, inverse, info); +} +// aten::linalg_inv_ex.inverse(Tensor A, *, bool check_errors=False, Tensor(a!) inverse, Tensor(b!) info) -> (Tensor(a!) inverse, Tensor(b!) info) +inline ::std::tuple linalg_inv_ex_outf(const at::Tensor & A, bool check_errors, at::Tensor & inverse, at::Tensor & info) { + return at::_ops::linalg_inv_ex_inverse::call(A, check_errors, inverse, info); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ex_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ex_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..bfc506f91107ddbb10d6c92a896a9901034bfe43 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ex_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API ::std::tuple linalg_inv_ex(const at::Tensor & A, bool check_errors=false); +TORCH_API ::std::tuple linalg_inv_ex_out(at::Tensor & inverse, at::Tensor & info, const at::Tensor & A, bool check_errors=false); +TORCH_API ::std::tuple linalg_inv_ex_outf(const at::Tensor & A, bool check_errors, at::Tensor & inverse, at::Tensor & info); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_factor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_factor.h new file mode 100644 index 0000000000000000000000000000000000000000..3a15412312fe0d962a841853dfc4eda8f4ac3bb4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_factor.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_ldl_factor(Tensor self, *, bool hermitian=False) -> (Tensor LD, Tensor pivots) +inline ::std::tuple linalg_ldl_factor(const at::Tensor & self, bool hermitian=false) { + return at::_ops::linalg_ldl_factor::call(self, hermitian); +} + +// aten::linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots) +inline ::std::tuple linalg_ldl_factor_out(at::Tensor & LD, at::Tensor & pivots, const at::Tensor & self, bool hermitian=false) { + return at::_ops::linalg_ldl_factor_out::call(self, hermitian, LD, pivots); +} +// aten::linalg_ldl_factor.out(Tensor self, *, bool hermitian=False, Tensor(a!) LD, Tensor(b!) pivots) -> (Tensor(a!) LD, Tensor(b!) pivots) +inline ::std::tuple linalg_ldl_factor_outf(const at::Tensor & self, bool hermitian, at::Tensor & LD, at::Tensor & pivots) { + return at::_ops::linalg_ldl_factor_out::call(self, hermitian, LD, pivots); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lstsq_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lstsq_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..27175e99c844789e6645d7d902c7673e841cda59 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lstsq_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple linalg_lstsq_out(at::Tensor & solution, at::Tensor & residuals, at::Tensor & rank, at::Tensor & singular_values, const at::Tensor & self, const at::Tensor & b, ::std::optional rcond=::std::nullopt, ::std::optional driver=::std::nullopt); +TORCH_API ::std::tuple linalg_lstsq_outf(const at::Tensor & self, const at::Tensor & b, ::std::optional rcond, ::std::optional driver, at::Tensor & solution, at::Tensor & residuals, at::Tensor & rank, at::Tensor & singular_values); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ff7327a8050209d72309f1e39859b7b7cf330fdd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple linalg_lu_factor(const at::Tensor & A, bool pivot=true); +TORCH_API ::std::tuple linalg_lu_factor_out(const at::Tensor & A, bool pivot, at::Tensor & LU, at::Tensor & pivots); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_solve_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_solve_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..38a55b08e3d38aee1d1171f790092d0f5196b67c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_solve_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor linalg_lu_solve(const at::Tensor & LU, const at::Tensor & pivots, const at::Tensor & B, bool left=true, bool adjoint=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_matrix_exp_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_matrix_exp_native.h new file mode 100644 index 0000000000000000000000000000000000000000..dc562d898c9bef25c41b7241368a0c46bc226c4e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_matrix_exp_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & linalg_matrix_exp_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor linalg_matrix_exp(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_matrix_rank_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_matrix_rank_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1329f399e5c8bd04ea0663e3f0372ae3f40397da --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_matrix_rank_native.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor linalg_matrix_rank(const at::Tensor & input, const ::std::optional & atol={}, const ::std::optional & rtol={}, bool hermitian=false); +TORCH_API at::Tensor & linalg_matrix_rank_out(const at::Tensor & input, const ::std::optional & atol, const ::std::optional & rtol, bool hermitian, at::Tensor & out); +TORCH_API at::Tensor linalg_matrix_rank(const at::Tensor & self, ::std::optional atol=::std::nullopt, ::std::optional rtol=::std::nullopt, bool hermitian=false); +TORCH_API at::Tensor & linalg_matrix_rank_out(const at::Tensor & self, ::std::optional atol, ::std::optional rtol, bool hermitian, at::Tensor & out); +TORCH_API at::Tensor linalg_matrix_rank(const at::Tensor & self, double tol, bool hermitian=false); +TORCH_API at::Tensor & linalg_matrix_rank_out(const at::Tensor & self, double tol, bool hermitian, at::Tensor & out); +TORCH_API at::Tensor linalg_matrix_rank(const at::Tensor & input, const at::Tensor & tol, bool hermitian=false); +TORCH_API at::Tensor & linalg_matrix_rank_out(const at::Tensor & input, const at::Tensor & tol, bool hermitian, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_multi_dot.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_multi_dot.h new file mode 100644 index 0000000000000000000000000000000000000000..85f62a7c75b1076e23ef9fd3c2c147bd1675fdf5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_multi_dot.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_multi_dot(Tensor[] tensors) -> Tensor +inline at::Tensor linalg_multi_dot(at::TensorList tensors) { + return at::_ops::linalg_multi_dot::call(tensors); +} + +// aten::linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_multi_dot_out(at::Tensor & out, at::TensorList tensors) { + return at::_ops::linalg_multi_dot_out::call(tensors, out); +} +// aten::linalg_multi_dot.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_multi_dot_outf(at::TensorList tensors, at::Tensor & out) { + return at::_ops::linalg_multi_dot_out::call(tensors, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_pinv_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_pinv_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f9464d45a7751087c8c387341fcdd9cd3128445a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_pinv_compositeimplicitautograd_dispatch.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor linalg_pinv(const at::Tensor & self, ::std::optional atol, ::std::optional rtol, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_out(at::Tensor & out, const at::Tensor & self, ::std::optional atol, ::std::optional rtol, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_outf(const at::Tensor & self, ::std::optional atol, ::std::optional rtol, bool hermitian, at::Tensor & out); +TORCH_API at::Tensor linalg_pinv(const at::Tensor & self, double rcond, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_out(at::Tensor & out, const at::Tensor & self, double rcond, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_outf(const at::Tensor & self, double rcond, bool hermitian, at::Tensor & out); +TORCH_API at::Tensor linalg_pinv(const at::Tensor & self, const at::Tensor & rcond, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & rcond, bool hermitian=false); +TORCH_API at::Tensor & linalg_pinv_outf(const at::Tensor & self, const at::Tensor & rcond, bool hermitian, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_qr.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_qr.h new file mode 100644 index 0000000000000000000000000000000000000000..60721caa16c157f4f6a02ca6ae043cb3f63015ca --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_qr.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R) +inline ::std::tuple linalg_qr(const at::Tensor & A, c10::string_view mode="reduced") { + return at::_ops::linalg_qr::call(A, mode); +} + +// aten::linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) +inline ::std::tuple linalg_qr_out(at::Tensor & Q, at::Tensor & R, const at::Tensor & A, c10::string_view mode="reduced") { + return at::_ops::linalg_qr_out::call(A, mode, Q, R); +} +// aten::linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) +inline ::std::tuple linalg_qr_outf(const at::Tensor & A, c10::string_view mode, at::Tensor & Q, at::Tensor & R) { + return at::_ops::linalg_qr_out::call(A, mode, Q, R); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_svd_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_svd_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..953b11b62d2e2c5b83ff531dfaf811ee7235627b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_svd_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API ::std::tuple linalg_svd(const at::Tensor & A, bool full_matrices=true, ::std::optional driver=::std::nullopt); +TORCH_API ::std::tuple linalg_svd_out(at::Tensor & U, at::Tensor & S, at::Tensor & Vh, const at::Tensor & A, bool full_matrices=true, ::std::optional driver=::std::nullopt); +TORCH_API ::std::tuple linalg_svd_outf(const at::Tensor & A, bool full_matrices, ::std::optional driver, at::Tensor & U, at::Tensor & S, at::Tensor & Vh); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_vector_norm.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_vector_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..77c85ebb64dc26118d3fd731e37ba8155a73c06f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_vector_norm.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor +inline at::Tensor linalg_vector_norm(const at::Tensor & self, const at::Scalar & ord=2, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt) { + return at::_ops::linalg_vector_norm::call(self, ord, dim, keepdim, dtype); +} + +// aten::linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_vector_norm_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & ord=2, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt) { + return at::_ops::linalg_vector_norm_out::call(self, ord, dim, keepdim, dtype, out); +} +// aten::linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_vector_norm_outf(const at::Tensor & self, const at::Scalar & ord, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out) { + return at::_ops::linalg_vector_norm_out::call(self, ord, dim, keepdim, dtype, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linear_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linear_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4af6aa47faf6a782527270e967cc00cfcf0ec7d8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linear_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor linear(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}); +TORCH_API at::Tensor & linear_out(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::Tensor & out); +TORCH_API at::Tensor nested_linear(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linear_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linear_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..70ce7e0ad76578be8bec51b69b49f1d078ace17f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linear_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API linear { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linear"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias); +}; + +struct TORCH_API linear_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linear"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..04e1219c0356c9f0a729533f219e511dcfe48611 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & linspace_out(at::Tensor & out, const at::Scalar & start, const at::Scalar & end, int64_t steps); +TORCH_API at::Tensor & linspace_outf(const at::Scalar & start, const at::Scalar & end, int64_t steps, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..38386ff8cfe8af30828a631816ab2c380afb79c4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_ops.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API linspace { + using schema = at::Tensor (const at::Scalar &, const at::Scalar &, int64_t, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(const at::Scalar & start, const at::Scalar & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API linspace_Tensor_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Tensor_Tensor"; + static constexpr const char* schema_str = "linspace.Tensor_Tensor(Tensor start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(const at::Tensor & start, const at::Tensor & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Tensor & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API linspace_Tensor_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &, int64_t, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Tensor_Scalar"; + static constexpr const char* schema_str = "linspace.Tensor_Scalar(Tensor start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(const at::Tensor & start, const at::Scalar & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Scalar & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API linspace_Scalar_Tensor { + using schema = at::Tensor (const at::Scalar &, const at::Tensor &, int64_t, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Scalar_Tensor"; + static constexpr const char* schema_str = "linspace.Scalar_Tensor(Scalar start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(const at::Scalar & start, const at::Tensor & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Tensor & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API linspace_out { + using schema = at::Tensor & (const at::Scalar &, const at::Scalar &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "linspace.out(Scalar start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Scalar & start, const at::Scalar & end, int64_t steps, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, int64_t steps, at::Tensor & out); +}; + +struct TORCH_API linspace_Tensor_Tensor_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Tensor_Tensor_out"; + static constexpr const char* schema_str = "linspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & start, const at::Tensor & end, int64_t steps, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Tensor & end, int64_t steps, at::Tensor & out); +}; + +struct TORCH_API linspace_Tensor_Scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Tensor_Scalar_out"; + static constexpr const char* schema_str = "linspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & start, const at::Scalar & end, int64_t steps, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Scalar & end, int64_t steps, at::Tensor & out); +}; + +struct TORCH_API linspace_Scalar_Tensor_out { + using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Scalar_Tensor_out"; + static constexpr const char* schema_str = "linspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Scalar & start, const at::Tensor & end, int64_t steps, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Tensor & end, int64_t steps, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..f3b8ce9c20e90bd2fd7cd33ad1596422f0ab5669 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_log1p : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5d90cb1da6e4c6bba3c7b211270c1b6323e43fe2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor log(const at::Tensor & self); +TORCH_API at::Tensor & log_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & log_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & log_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_normal_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_normal_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..df780a6a1ce45523a72dd9feb3e4e15147d6301b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_normal_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & log_normal_(at::Tensor & self, double mean=1, double std=2, ::std::optional generator=::std::nullopt); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid.h new file mode 100644 index 0000000000000000000000000000000000000000..8bbe6f9a3fcf089cc13fb1e35bb9adabc0d2df27 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_sigmoid.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & log_sigmoid_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::log_sigmoid_out::call(self, out); +} +// aten::log_sigmoid.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & log_sigmoid_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::log_sigmoid_out::call(self, out); +} + +// aten::log_sigmoid(Tensor self) -> Tensor +inline at::Tensor log_sigmoid(const at::Tensor & self) { + return at::_ops::log_sigmoid::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_softmax.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..f0f4595d6f6cebfce18ef5c4ef1b0aed89ff1032 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/log_softmax.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor +inline at::Tensor log_softmax(const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::log_softmax_int::call(self, dim, dtype); +} + +// aten::log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & log_softmax_out(at::Tensor & out, const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::log_softmax_int_out::call(self, dim, dtype, out); +} +// aten::log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & log_softmax_outf(const at::Tensor & self, int64_t dim, ::std::optional dtype, at::Tensor & out) { + return at::_ops::log_softmax_int_out::call(self, dim, dtype, out); +} + +// aten::log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor +inline at::Tensor log_softmax(const at::Tensor & self, at::Dimname dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::log_softmax_Dimname::call(self, dim, dtype); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a82090c3d80abf60be679baa606476d2580b329d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor logaddexp2(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & logaddexp2_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & logaddexp2_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2def00538d88a286402b51069c77743b37422d92 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_logaddexp2_out : public at::meta::structured_logaddexp2 { +void impl(const at::Tensor & self, const at::Tensor & other, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp.h new file mode 100644 index 0000000000000000000000000000000000000000..38bd065af34fef814349283c1584b66df350449d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp.h @@ -0,0 +1,59 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::logcumsumexp(Tensor self, int dim) -> Tensor +inline at::Tensor logcumsumexp(const at::Tensor & self, int64_t dim) { + return at::_ops::logcumsumexp::call(self, dim); +} + +// aten::logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logcumsumexp_out(at::Tensor & out, const at::Tensor & self, int64_t dim) { + return at::_ops::logcumsumexp_out::call(self, dim, out); +} +// aten::logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logcumsumexp_outf(const at::Tensor & self, int64_t dim, at::Tensor & out) { + return at::_ops::logcumsumexp_out::call(self, dim, out); +} + +// aten::logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor +inline at::Tensor logcumsumexp(const at::Tensor & self, at::Dimname dim) { + return at::_ops::logcumsumexp_dimname::call(self, dim); +} + +// aten::logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logcumsumexp_out(at::Tensor & out, const at::Tensor & self, at::Dimname dim) { + return at::_ops::logcumsumexp_dimname_out::call(self, dim, out); +} +// aten::logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logcumsumexp_outf(const at::Tensor & self, at::Dimname dim, at::Tensor & out) { + return at::_ops::logcumsumexp_dimname_out::call(self, dim, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logical_and_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logical_and_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..29e594c7dfdb46fb0fcd474187b291ff97588823 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logical_and_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor logical_and(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & logical_and_(at::Tensor & self, const at::Tensor & other); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logit_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logit_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..43492ebe60e9259aedfb618d252393c005f3810e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logit_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_logit_backward_out : public at::meta::structured_logit_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, ::std::optional eps, const at::Tensor & grad_input); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logit_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logit_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3730bcb3b0673d0a65036a90891af1b99c2a2a1c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logit_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor logit(const at::Tensor & self, ::std::optional eps=::std::nullopt); +TORCH_API at::Tensor & logit_out(at::Tensor & out, const at::Tensor & self, ::std::optional eps=::std::nullopt); +TORCH_API at::Tensor & logit_outf(const at::Tensor & self, ::std::optional eps, at::Tensor & out); +TORCH_API at::Tensor & logit_(at::Tensor & self, ::std::optional eps=::std::nullopt); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logspace.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logspace.h new file mode 100644 index 0000000000000000000000000000000000000000..3f5e6583ecc8b91fa3300c50dd00112e002736f7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logspace.h @@ -0,0 +1,103 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base=10.0, at::TensorOptions options={}) { + return at::_ops::logspace::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::logspace(Scalar start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::logspace::call(start, end, steps, base, dtype, layout, device, pin_memory); +} + +// aten::logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Tensor & start, const at::Tensor & end, int64_t steps, double base=10.0, at::TensorOptions options={}) { + return at::_ops::logspace_Tensor_Tensor::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::logspace.Tensor_Tensor(Tensor start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Tensor & start, const at::Tensor & end, int64_t steps, double base, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::logspace_Tensor_Tensor::call(start, end, steps, base, dtype, layout, device, pin_memory); +} + +// aten::logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Tensor & start, const at::Scalar & end, int64_t steps, double base=10.0, at::TensorOptions options={}) { + return at::_ops::logspace_Tensor_Scalar::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::logspace.Tensor_Scalar(Tensor start, Scalar end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Tensor & start, const at::Scalar & end, int64_t steps, double base, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::logspace_Tensor_Scalar::call(start, end, steps, base, dtype, layout, device, pin_memory); +} + +// aten::logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Scalar & start, const at::Tensor & end, int64_t steps, double base=10.0, at::TensorOptions options={}) { + return at::_ops::logspace_Scalar_Tensor::call(start, end, steps, base, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::logspace.Scalar_Tensor(Scalar start, Tensor end, int steps, float base=10.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor logspace(const at::Scalar & start, const at::Tensor & end, int64_t steps, double base, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::logspace_Scalar_Tensor::call(start, end, steps, base, dtype, layout, device, pin_memory); +} + +// aten::logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_out(at::Tensor & out, const at::Scalar & start, const at::Scalar & end, int64_t steps, double base=10.0) { + return at::_ops::logspace_out::call(start, end, steps, base, out); +} +// aten::logspace.out(Scalar start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_outf(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base, at::Tensor & out) { + return at::_ops::logspace_out::call(start, end, steps, base, out); +} + +// aten::logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_out(at::Tensor & out, const at::Tensor & start, const at::Tensor & end, int64_t steps, double base=10.0) { + return at::_ops::logspace_Tensor_Tensor_out::call(start, end, steps, base, out); +} +// aten::logspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_outf(const at::Tensor & start, const at::Tensor & end, int64_t steps, double base, at::Tensor & out) { + return at::_ops::logspace_Tensor_Tensor_out::call(start, end, steps, base, out); +} + +// aten::logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_out(at::Tensor & out, const at::Tensor & start, const at::Scalar & end, int64_t steps, double base=10.0) { + return at::_ops::logspace_Tensor_Scalar_out::call(start, end, steps, base, out); +} +// aten::logspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_outf(const at::Tensor & start, const at::Scalar & end, int64_t steps, double base, at::Tensor & out) { + return at::_ops::logspace_Tensor_Scalar_out::call(start, end, steps, base, out); +} + +// aten::logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_out(at::Tensor & out, const at::Scalar & start, const at::Tensor & end, int64_t steps, double base=10.0) { + return at::_ops::logspace_Scalar_Tensor_out::call(start, end, steps, base, out); +} +// aten::logspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, float base=10.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logspace_outf(const at::Scalar & start, const at::Tensor & end, int64_t steps, double base, at::Tensor & out) { + return at::_ops::logspace_Scalar_Tensor_out::call(start, end, steps, base, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logspace_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logspace_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8552951b6e282d4acef105889566d69c98de73dc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logspace_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & logspace_out(at::Tensor & out, const at::Scalar & start, const at::Scalar & end, int64_t steps, double base=10.0); +TORCH_API at::Tensor & logspace_outf(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logsumexp_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logsumexp_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d66a1906385fa3dac13903eba6112e82173bb07a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/logsumexp_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor logsumexp(const at::Tensor & self, at::DimnameList dim, bool keepdim=false); +TORCH_API at::Tensor & logsumexp_out(at::Tensor & out, const at::Tensor & self, at::DimnameList dim, bool keepdim=false); +TORCH_API at::Tensor & logsumexp_outf(const at::Tensor & self, at::DimnameList dim, bool keepdim, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/margin_ranking_loss_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/margin_ranking_loss_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..45bc5281e0935a17baec899f9368aabcbcde03d8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/margin_ranking_loss_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor margin_ranking_loss(const at::Tensor & input1, const at::Tensor & input2, const at::Tensor & target, double margin=0.0, int64_t reduction=at::Reduction::Mean); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/masked_fill_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/masked_fill_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ba6cb70cc3e2a6570e1f4633a1e38ffce5657b43 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/masked_fill_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & masked_fill_(at::Tensor & self, const at::Tensor & mask, const at::Scalar & value); +TORCH_API at::Tensor & masked_fill_(at::Tensor & self, const at::Tensor & mask, const at::Tensor & value); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/masked_fill_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/masked_fill_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3cd90616ed746e0e96ec2c1394ae473c5880ba07 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/masked_fill_native.h @@ -0,0 +1,38 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor masked_fill(const at::Tensor & self, const at::Tensor & mask, const at::Scalar & value); +TORCH_API at::Tensor & masked_fill_Scalar_out(const at::Tensor & self, const at::Tensor & mask, const at::Scalar & value, at::Tensor & out); +TORCH_API at::Tensor & masked_fill__cpu(at::Tensor & self, const at::Tensor & mask, const at::Scalar & value); +TORCH_API at::Tensor & masked_fill__cuda(at::Tensor & self, const at::Tensor & mask, const at::Scalar & value); +TORCH_API at::Tensor NestedTensor_masked_fill(const at::Tensor & self, const at::Tensor & mask, const at::Scalar & value); +TORCH_API at::Tensor & masked_fill__quantized_cpu(at::Tensor & self, const at::Tensor & mask, const at::Scalar & value); +TORCH_API at::Tensor & masked_fill__quantized_cuda(at::Tensor & self, const at::Tensor & mask, const at::Scalar & value); +TORCH_API at::Tensor masked_fill(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & value); +TORCH_API at::Tensor & masked_fill_Tensor_out(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & value, at::Tensor & out); +TORCH_API at::Tensor & masked_fill__cpu(at::Tensor & self, const at::Tensor & mask, const at::Tensor & value); +TORCH_API at::Tensor & masked_fill__cuda(at::Tensor & self, const at::Tensor & mask, const at::Tensor & value); +TORCH_API at::Tensor & masked_fill__quantized_cpu(at::Tensor & self, const at::Tensor & mask, const at::Tensor & value); +TORCH_API at::Tensor & masked_fill__quantized_cuda(at::Tensor & self, const at::Tensor & mask, const at::Tensor & value); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ee2338a13b4a02ca999367fb5ff8fc1464084b58 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & masked_scatter_(at::Tensor & self, const at::Tensor & mask, const at::Tensor & source); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H.h new file mode 100644 index 0000000000000000000000000000000000000000..11efff31192701882f21d5a26fc10987ee5f572b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fb998b9b729ef80bf11ace1c4b8214d57fba611f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor matrix_H(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_power.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_power.h new file mode 100644 index 0000000000000000000000000000000000000000..37f2f26354b0679784f1dad9f6505f579166a926 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_power.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::matrix_power(Tensor self, int n) -> Tensor +inline at::Tensor matrix_power(const at::Tensor & self, int64_t n) { + return at::_ops::matrix_power::call(self, n); +} + +// aten::matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & matrix_power_out(at::Tensor & out, const at::Tensor & self, int64_t n) { + return at::_ops::matrix_power_out::call(self, n, out); +} +// aten::matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & matrix_power_outf(const at::Tensor & self, int64_t n, at::Tensor & out) { + return at::_ops::matrix_power_out::call(self, n, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/max_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/max_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9553f183d4b7d164df8519e020f8a00182ea69f2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/max_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API ::std::tuple max(const at::Tensor & self, int64_t dim, bool keepdim=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool1d_with_indices.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool1d_with_indices.h new file mode 100644 index 0000000000000000000000000000000000000000..88dc799be41b347b42a4bc8ba49d610649af74d0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool1d_with_indices.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) +inline ::std::tuple max_pool1d_with_indices(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) { + return at::_ops::max_pool1d_with_indices::call(self, kernel_size, stride, padding, dilation, ceil_mode); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5c0f66b147a518c3903e51ba0274ce03fb258ec1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple max_pool2d_with_indices(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false); +TORCH_API ::std::tuple max_pool2d_with_indices_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false); +TORCH_API ::std::tuple max_pool2d_with_indices_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mean_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mean_native.h new file mode 100644 index 0000000000000000000000000000000000000000..fb8b34a3a8527e41133c8e6d5293dde599e8a112 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mean_native.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +TORCH_API at::Tensor mean(const at::Tensor & self, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & mean_dtype_out(const at::Tensor & self, ::std::optional dtype, at::Tensor & out); +struct TORCH_API structured_mean_out : public at::meta::structured_mean_dim { +void impl(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, const at::Tensor & out); +}; +TORCH_API at::Tensor mean_quantized_cpu(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & mean_out_quantized_cpu(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out); +TORCH_API at::Tensor mean(const at::Tensor & self, at::DimnameList dim, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & mean_out(const at::Tensor & self, at::DimnameList dim, bool keepdim, ::std::optional dtype, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/median_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/median_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c11ed2cbcfdd794f58454ee98479f3f2e2f05afd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/median_native.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & median_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor median_cpu(const at::Tensor & self); +TORCH_API at::Tensor median_cuda(const at::Tensor & self); +TORCH_API ::std::tuple median(const at::Tensor & self, int64_t dim, bool keepdim=false); +TORCH_API ::std::tuple median_out_cpu(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +TORCH_API ::std::tuple median_out_cuda(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +TORCH_API ::std::tuple median(const at::Tensor & self, at::Dimname dim, bool keepdim=false); +TORCH_API ::std::tuple median_out(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/median_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/median_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..47b96ce1288dd84f0d74fea75e207fcd160424c3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/median_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API median { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "median(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API median_dim { + using schema = ::std::tuple (const at::Tensor &, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = "dim"; + static constexpr const char* schema_str = "median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, int64_t dim, bool keepdim); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim); +}; + +struct TORCH_API median_dim_values { + using schema = ::std::tuple (const at::Tensor &, int64_t, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = "dim_values"; + static constexpr const char* schema_str = "median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API median_names_dim { + using schema = ::std::tuple (const at::Tensor &, at::Dimname, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = "names_dim"; + static constexpr const char* schema_str = "median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, at::Dimname dim, bool keepdim); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim); +}; + +struct TORCH_API median_names_dim_values { + using schema = ::std::tuple (const at::Tensor &, at::Dimname, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = "names_dim_values"; + static constexpr const char* schema_str = "median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API median_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "median.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..bee78a94be892d967760dfcb280055fc93866e23 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API miopen_convolution { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::miopen_convolution"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic); +}; + +struct TORCH_API miopen_convolution_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, bool, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::miopen_convolution"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "miopen_convolution.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_relu_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_relu_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..137eba207341a54439ca3cf1b18f603b1e339626 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_relu_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API miopen_convolution_relu { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::miopen_convolution_relu"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0ed35c78d273e01fc25cc5619ca2bf6ce9a19df5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor math_mish_backward(const at::Tensor & grad_output, const at::Tensor & self); +TORCH_API at::Tensor mish_backward(const at::Tensor & grad_output, const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..5d1bee44121dcccfdfb99f701c1a1f57a528b88b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::mkldnn_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor +inline at::Tensor mkldnn_adaptive_avg_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::mkldnn_adaptive_avg_pool2d_backward::call(grad_output, self); +} + +// aten::mkldnn_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_adaptive_avg_pool2d_backward_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::mkldnn_adaptive_avg_pool2d_backward_out::call(grad_output, self, out); +} +// aten::mkldnn_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_adaptive_avg_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out) { + return at::_ops::mkldnn_adaptive_avg_pool2d_backward_out::call(grad_output, self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3125938f98b5cca430916902a0e7f8fee69c2e26 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API mkldnn_adaptive_avg_pool2d_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mkldnn_adaptive_avg_pool2d_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "mkldnn_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self); +}; + +struct TORCH_API mkldnn_adaptive_avg_pool2d_backward_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mkldnn_adaptive_avg_pool2d_backward"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "mkldnn_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4b8399254b2f4ac51ae809f800672a90268a89a7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API mkldnn_adaptive_avg_pool2d { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mkldnn_adaptive_avg_pool2d"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::IntArrayRef output_size); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size); +}; + +struct TORCH_API mkldnn_adaptive_avg_pool2d_out { + using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mkldnn_adaptive_avg_pool2d"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "mkldnn_adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_weights.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_weights.h new file mode 100644 index 0000000000000000000000000000000000000000..ca183b58f8bee439911855a302823bbfea12c013 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_weights.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::mkldnn_linear_backward_weights(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined) -> (Tensor, Tensor) +inline ::std::tuple mkldnn_linear_backward_weights(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined) { + return at::_ops::mkldnn_linear_backward_weights::call(grad_output, input, weight, bias_defined); +} + +// aten::mkldnn_linear_backward_weights.out(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple mkldnn_linear_backward_weights_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined) { + return at::_ops::mkldnn_linear_backward_weights_out::call(grad_output, input, weight, bias_defined, out0, out1); +} +// aten::mkldnn_linear_backward_weights.out(Tensor grad_output, Tensor input, Tensor weight, bool bias_defined, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple mkldnn_linear_backward_weights_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, bool bias_defined, at::Tensor & out0, at::Tensor & out1) { + return at::_ops::mkldnn_linear_backward_weights_out::call(grad_output, input, weight, bias_defined, out0, out1); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_max_pool2d_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_max_pool2d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..55ba9a65851936d4fd7855811a69319256148996 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_max_pool2d_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & mkldnn_max_pool2d_backward_out(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out); +TORCH_API at::Tensor mkldnn_max_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_max_pool3d_backward_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_max_pool3d_backward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..91fa08b5632f8c67e338ae712a30832b221091f6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_max_pool3d_backward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & mkldnn_max_pool3d_backward_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false); +TORCH_API at::Tensor & mkldnn_max_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_reorder_conv2d_weight.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_reorder_conv2d_weight.h new file mode 100644 index 0000000000000000000000000000000000000000..b89a389c74bc14787a0634b51319d2c919b9cc6e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_reorder_conv2d_weight.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::mkldnn_reorder_conv2d_weight(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor +inline at::Tensor mkldnn_reorder_conv2d_weight(const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv2d_weight::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt); +} +namespace symint { + template >> + at::Tensor mkldnn_reorder_conv2d_weight(const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv2d_weight::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt); + } +} + +// aten::mkldnn_reorder_conv2d_weight(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor +inline at::Tensor mkldnn_reorder_conv2d_weight_symint(const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv2d_weight::call(self, padding, stride, dilation, groups, input_size); +} +namespace symint { + template >> + at::Tensor mkldnn_reorder_conv2d_weight(const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv2d_weight::call(self, padding, stride, dilation, groups, input_size); + } +} + +// aten::mkldnn_reorder_conv2d_weight.out(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_reorder_conv2d_weight_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv2d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv2d_weight_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv2d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); + } +} + +// aten::mkldnn_reorder_conv2d_weight.out(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_reorder_conv2d_weight_outf(const at::Tensor & self, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::OptionalIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv2d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv2d_weight_outf(const at::Tensor & self, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::OptionalIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv2d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); + } +} + +// aten::mkldnn_reorder_conv2d_weight.out(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_reorder_conv2d_weight_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv2d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv2d_weight_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv2d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); + } +} + +// aten::mkldnn_reorder_conv2d_weight.out(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_reorder_conv2d_weight_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::OptionalSymIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv2d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv2d_weight_outf(const at::Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::OptionalSymIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv2d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_reorder_conv3d_weight.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_reorder_conv3d_weight.h new file mode 100644 index 0000000000000000000000000000000000000000..a99274afc925dd73a67ed0a0cae58026f2fdc899 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_reorder_conv3d_weight.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor +inline at::Tensor mkldnn_reorder_conv3d_weight(const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt); +} +namespace symint { + template >> + at::Tensor mkldnn_reorder_conv3d_weight(const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt); + } +} + +// aten::mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor +inline at::Tensor mkldnn_reorder_conv3d_weight_symint(const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight::call(self, padding, stride, dilation, groups, input_size); +} +namespace symint { + template >> + at::Tensor mkldnn_reorder_conv3d_weight(const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight::call(self, padding, stride, dilation, groups, input_size); + } +} + +// aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_reorder_conv3d_weight_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv3d_weight_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); + } +} + +// aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_reorder_conv3d_weight_outf(const at::Tensor & self, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::OptionalIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv3d_weight_outf(const at::Tensor & self, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::OptionalIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); + } +} + +// aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_reorder_conv3d_weight_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv3d_weight_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); + } +} + +// aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_reorder_conv3d_weight_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::OptionalSymIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv3d_weight_outf(const at::Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::OptionalSymIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mm_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mm_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6a157f36ff12025a5fff065548f84c11d588ae16 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mm_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor mm(const at::Tensor & self, const at::Tensor & mat2); +TORCH_API at::Tensor & mm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mat2); +TORCH_API at::Tensor & mm_outf(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mm_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..15dbd2b998da562d987abe092c6d80823a8cc164 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mm_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API mm { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mm"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "mm(Tensor self, Tensor mat2) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mat2); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2); +}; + +struct TORCH_API mm_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mm"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); +}; + +struct TORCH_API mm_dtype { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, at::ScalarType); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mm"; + static constexpr const char* overload_name = "dtype"; + static constexpr const char* schema_str = "mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype); +}; + +struct TORCH_API mm_dtype_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::ScalarType, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mm"; + static constexpr const char* overload_name = "dtype_out"; + static constexpr const char* schema_str = "mm.dtype_out(Tensor self, Tensor mat2, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mode_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mode_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f7456dd2b16ca70de364ac102a53371b9b06ffd1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mode_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple mode(const at::Tensor & self, int64_t dim=-1, bool keepdim=false); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_backward_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_backward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c10510d0153f19bd7629bbf2db70737f3f8e985b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_backward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple mps_convolution_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, ::std::array output_mask); +TORCH_API ::std::tuple mps_convolution_backward_outf(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); +TORCH_API ::std::tuple mps_convolution_backward_symint_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array output_mask); +TORCH_API ::std::tuple mps_convolution_backward_symint_outf(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_transpose_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_transpose_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0f2860569a561e95875cdd8a62ac606b60514c8a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mps_convolution_transpose_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple mps_convolution_transpose_backward_out_symint(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_backward_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f9856cf26e8a926b628df7fac2163d441b8d97e9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_backward_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor mse_loss_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction); +TORCH_API at::Tensor & mse_loss_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction); +TORCH_API at::Tensor & mse_loss_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9ca90b822f06ef50c53bf657bae743ac0f923c89 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor mse_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean); +TORCH_API at::Tensor & mse_loss_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean); +TORCH_API at::Tensor & mse_loss_outf(const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..632e2d27322b50735d1585f548a70de09e3349e6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & mvlgamma_out(at::Tensor & out, const at::Tensor & self, int64_t p); +TORCH_API at::Tensor & mvlgamma_outf(const at::Tensor & self, int64_t p, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b9c6ef27fe6bd312836fcc317ef427e505464ba5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor mvlgamma(const at::Tensor & self, int64_t p); +TORCH_API at::Tensor & mvlgamma_(at::Tensor & self, int64_t p); +TORCH_API at::Tensor & mvlgamma_out(const at::Tensor & self, int64_t p, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nan_to_num_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nan_to_num_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f9138eb842b6cfb380143fff9ee40b9a4a5f28d5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nan_to_num_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & nan_to_num_out(at::Tensor & out, const at::Tensor & self, ::std::optional nan=::std::nullopt, ::std::optional posinf=::std::nullopt, ::std::optional neginf=::std::nullopt); +TORCH_API at::Tensor & nan_to_num_outf(const at::Tensor & self, ::std::optional nan, ::std::optional posinf, ::std::optional neginf, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nanmean_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nanmean_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d462705129f912b3f5b0f85fe61dcde72386f3cf --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nanmean_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor nanmean(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & nanmean_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & nanmean_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nansum.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nansum.h new file mode 100644 index 0000000000000000000000000000000000000000..899f24cc1bb55429941782889bc8ca07ff36dc5d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nansum.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor +inline at::Tensor nansum(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt) { + return at::_ops::nansum::call(self, dim, keepdim, dtype); +} + +// aten::nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & nansum_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt) { + return at::_ops::nansum_out::call(self, dim, keepdim, dtype, out); +} +// aten::nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & nansum_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out) { + return at::_ops::nansum_out::call(self, dim, keepdim, dtype, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nansum_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nansum_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e4873dd71df73b8cb90e4bbebbb44670fb1337ad --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nansum_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor nansum(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & nansum_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & nansum_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..7e9e4f4df60d5b8e748897094422364f1d7efe56 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor +inline at::Tensor narrow_copy(const at::Tensor & self, int64_t dim, int64_t start, int64_t length) { + return at::_ops::narrow_copy::call(self, dim, start, length); +} +namespace symint { + template >> + at::Tensor narrow_copy(const at::Tensor & self, int64_t dim, int64_t start, int64_t length) { + return at::_ops::narrow_copy::call(self, dim, start, length); + } +} + +// aten::narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor +inline at::Tensor narrow_copy_symint(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) { + return at::_ops::narrow_copy::call(self, dim, start, length); +} +namespace symint { + template >> + at::Tensor narrow_copy(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) { + return at::_ops::narrow_copy::call(self, dim, start, length); + } +} + +// aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & narrow_copy_out(at::Tensor & out, const at::Tensor & self, int64_t dim, int64_t start, int64_t length) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); +} +namespace symint { + template >> + at::Tensor & narrow_copy_out(at::Tensor & out, const at::Tensor & self, int64_t dim, int64_t start, int64_t length) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); + } +} + +// aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & narrow_copy_outf(const at::Tensor & self, int64_t dim, int64_t start, int64_t length, at::Tensor & out) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); +} +namespace symint { + template >> + at::Tensor & narrow_copy_outf(const at::Tensor & self, int64_t dim, int64_t start, int64_t length, at::Tensor & out) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); + } +} + +// aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & narrow_copy_symint_out(at::Tensor & out, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); +} +namespace symint { + template >> + at::Tensor & narrow_copy_out(at::Tensor & out, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); + } +} + +// aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & narrow_copy_symint_outf(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, at::Tensor & out) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); +} +namespace symint { + template >> + at::Tensor & narrow_copy_outf(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, at::Tensor & out) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2b570ae75a2edeb57b015585befd97123fdabb7d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API narrow_copy { + using schema = at::Tensor (const at::Tensor &, int64_t, c10::SymInt, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::narrow_copy"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length); +}; + +struct TORCH_API narrow_copy_out { + using schema = at::Tensor & (const at::Tensor &, int64_t, c10::SymInt, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::narrow_copy"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a64c03d85e02f3a345540f14983ce480bf669853 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple native_batch_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & running_mean, const ::std::optional & running_var, const ::std::optional & save_mean, const ::std::optional & save_invstd, bool train, double eps, ::std::array output_mask); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ne_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ne_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..1cc60b9ff6c7436190751fd930db32a966d620f3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ne_meta.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_ne_Scalar : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Scalar & other); +}; +struct TORCH_API structured_ne_Tensor : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/negative.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/negative.h new file mode 100644 index 0000000000000000000000000000000000000000..58e3c52b13fbc7aebd735891a00f872779d14e83 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/negative.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::negative(Tensor self) -> Tensor +inline at::Tensor negative(const at::Tensor & self) { + return at::_ops::negative::call(self); +} + +// aten::negative_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & negative_(at::Tensor & self) { + return at::_ops::negative_::call(self); +} + +// aten::negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & negative_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::negative_out::call(self, out); +} +// aten::negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & negative_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::negative_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..91a196bf8d27f38cc3e0f678137ad7cfb3b62596 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API nested_to_padded_tensor { + using schema = at::Tensor (const at::Tensor &, double, at::OptionalIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nested_to_padded_tensor"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "nested_to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/new_empty_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/new_empty_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0104bb07aad2a66588ad9195b78e1e3b55a588bf --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/new_empty_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor new_empty_symint(const at::Tensor & self, c10::SymIntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & new_empty_out_symint(const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/new_full_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/new_full_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9a563f2c91b7e7623cb8349f08a1370bd3e16061 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/new_full_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor new_full(const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & new_full_out_symint(const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/new_ones_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/new_ones_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7f66e6a943905c0f75401bdf7e84593fae5fcd70 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/new_ones_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor new_ones(const at::Tensor & self, at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & new_ones_out_symint(const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nextafter_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nextafter_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..91bbdf872ab3ff8d0b084fecff11e2ae7b5bc56c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nextafter_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor nextafter(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & nextafter_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & nextafter_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & nextafter_(at::Tensor & self, const at::Tensor & other); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_forward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_forward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..5b935cdac5561593d0f6be7f94289e570db09d8f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_forward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API nll_loss2d_forward_output { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nll_loss2d_forward"; + static constexpr const char* overload_name = "output"; + static constexpr const char* schema_str = "nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) total_weight) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & output, at::Tensor & total_weight); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & output, at::Tensor & total_weight); +}; + +struct TORCH_API nll_loss2d_forward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nll_loss2d_forward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)"; + static ::std::tuple call(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/norm_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/norm_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a9aab1309a9a344841d92f6a7aa729424535f13c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/norm_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor norm(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype); +TORCH_API at::Tensor & norm_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype); +TORCH_API at::Tensor & norm_outf(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype, at::Tensor & out); +TORCH_API at::Tensor norm(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim=false); +TORCH_API at::Tensor & norm_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim=false); +TORCH_API at::Tensor & norm_outf(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/norm_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/norm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..59557b984cf908ce1c66d6420a6a1cb54c3546d9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/norm_native.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +TORCH_API at::Tensor norm(const at::Tensor & self, const ::std::optional & p, at::ScalarType dtype); +TORCH_API at::Tensor & norm_ScalarOpt_dtype_out(const at::Tensor & self, const ::std::optional & p, at::ScalarType dtype, at::Tensor & out); +TORCH_API at::Tensor norm(const at::Tensor & self, const at::Scalar & p=2); +TORCH_API at::Tensor & norm_Scalar_out(const at::Tensor & self, const at::Scalar & p, at::Tensor & out); +struct TORCH_API structured_norm_dtype_out : public at::meta::structured_norm_ScalarOpt_dim_dtype { +void impl(const at::Tensor & self, at::OptionalScalarRef p, at::IntArrayRef dim, bool keepdim, at::ScalarType dtype, const at::Tensor & out); +}; +TORCH_API at::Tensor sparse_dtype_norm(const at::Tensor & self, const ::std::optional & p, at::IntArrayRef dim, bool keepdim, at::ScalarType dtype); +struct TORCH_API structured_norm_out : public at::meta::structured_norm_ScalarOpt_dim { +void impl(const at::Tensor & self, at::OptionalScalarRef p, at::IntArrayRef dim, bool keepdim, const at::Tensor & out); +}; +TORCH_API at::Tensor sparse_norm(const at::Tensor & self, const ::std::optional & p, at::IntArrayRef dim, bool keepdim=false); +TORCH_API at::Tensor norm(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype); +TORCH_API at::Tensor & norm_out(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype, at::Tensor & out); +TORCH_API at::Tensor norm(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim=false); +TORCH_API at::Tensor & norm_out(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ones_like.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ones_like.h new file mode 100644 index 0000000000000000000000000000000000000000..09363d5b8526eec591fc5997c31489267876dd20 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/ones_like.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor ones_like(const at::Tensor & self, at::TensorOptions options={}, ::std::optional memory_format=::std::nullopt) { + return at::_ops::ones_like::call(self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); +} +// aten::ones_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor ones_like(const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format) { + return at::_ops::ones_like::call(self, dtype, layout, device, pin_memory, memory_format); +} + +// aten::ones_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & ones_like_out(at::Tensor & out, const at::Tensor & self, ::std::optional memory_format=::std::nullopt) { + return at::_ops::ones_like_out::call(self, memory_format, out); +} +// aten::ones_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & ones_like_outf(const at::Tensor & self, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::ones_like_out::call(self, memory_format, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/pin_memory_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/pin_memory_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0d88e78a9928b805ab5f82a5647f4392043098e7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/pin_memory_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API pin_memory { + using schema = at::Tensor (const at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::pin_memory"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, ::std::optional device); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional device); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..afd8ffc3aa48640cfbd697fcfe4c54a6991c10b8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_shuffle_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor pixel_shuffle(const at::Tensor & self, int64_t upscale_factor); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4a1e7e202e07cabbc542b6ece25bc621dd1c3add --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & poisson_out(at::Tensor & out, const at::Tensor & self, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & poisson_outf(const at::Tensor & self, ::std::optional generator, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f5094d41c3fd272b7244d514365d1344725e2df0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & poisson_out(const at::Tensor & self, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor _s_poisson_cpu(const at::Tensor & self, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor _s_poisson_cuda(const at::Tensor & self, ::std::optional generator=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/polar_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/polar_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cca96a403edc4ef42e7d40c34c8797761d405150 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/polar_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & polar_out(at::Tensor & out, const at::Tensor & abs, const at::Tensor & angle); +TORCH_API at::Tensor & polar_outf(const at::Tensor & abs, const at::Tensor & angle, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/positive.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/positive.h new file mode 100644 index 0000000000000000000000000000000000000000..a9727581c1888aa5c86fd276f3de4d10533cffbe --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/positive.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::positive(Tensor(a) self) -> Tensor(a) +inline at::Tensor positive(const at::Tensor & self) { + return at::_ops::positive::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/prelu.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/prelu.h new file mode 100644 index 0000000000000000000000000000000000000000..a246fe8f7cbd882e080d33878029cf71495e3ecd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/prelu.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::prelu(Tensor self, Tensor weight) -> Tensor +inline at::Tensor prelu(const at::Tensor & self, const at::Tensor & weight) { + return at::_ops::prelu::call(self, weight); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/q_per_channel_axis.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/q_per_channel_axis.h new file mode 100644 index 0000000000000000000000000000000000000000..820adab19470d0287dc8f53bca0748516caf07d3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/q_per_channel_axis.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::q_per_channel_axis(Tensor self) -> int +inline int64_t q_per_channel_axis(const at::Tensor & self) { + return at::_ops::q_per_channel_axis::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/qr_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/qr_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1c833024ceb6132438c7b9e973fc6de1189038a3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/qr_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API qr_Q { + using schema = ::std::tuple (const at::Tensor &, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::qr"; + static constexpr const char* overload_name = "Q"; + static constexpr const char* schema_str = "qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)"; + static ::std::tuple call(const at::Tensor & self, bool some, at::Tensor & Q, at::Tensor & R); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool some, at::Tensor & Q, at::Tensor & R); +}; + +struct TORCH_API qr { + using schema = ::std::tuple (const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::qr"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R)"; + static ::std::tuple call(const at::Tensor & self, bool some); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool some); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7d35c52e7de7b184712cfbe9050d0b6b7d4e6bd2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & quantize_per_channel_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::ScalarType dtype); +TORCH_API at::Tensor & quantize_per_channel_outf(const at::Tensor & self, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::ScalarType dtype, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..462c749506b0c7f75958b845e0d2857b55da34ea --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor quantize_per_channel(const at::Tensor & self, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::ScalarType dtype); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_gru_cell_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_gru_cell_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..56b828db2704ed477a7fe536fe675877b98a8cd1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_gru_cell_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API quantized_gru_cell { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::quantized_gru_cell"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_lstm_cell.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_lstm_cell.h new file mode 100644 index 0000000000000000000000000000000000000000..bf278acab96f97f944f45c3713d26a31edd81721 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_lstm_cell.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor) +inline ::std::tuple quantized_lstm_cell(const at::Tensor & input, at::TensorList hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh) { + return at::_ops::quantized_lstm_cell::call(input, hx, w_ih, w_hh, b_ih, b_hh, packed_ih, packed_hh, col_offsets_ih, col_offsets_hh, scale_ih, scale_hh, zero_point_ih, zero_point_hh); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool2d.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..0d78ac59e59dc61b49ace81989edb0059fed0f57 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool2d.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor +inline at::Tensor quantized_max_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) { + return at::_ops::quantized_max_pool2d::call(self, kernel_size, stride, padding, dilation, ceil_mode); +} + +// aten::quantized_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & quantized_max_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) { + return at::_ops::quantized_max_pool2d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out); +} +// aten::quantized_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & quantized_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) { + return at::_ops::quantized_max_pool2d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_rnn_tanh_cell_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_rnn_tanh_cell_native.h new file mode 100644 index 0000000000000000000000000000000000000000..505c6411a6bd44b99d7efd592bb6bd521a1f1175 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_rnn_tanh_cell_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor quantized_rnn_tanh_cell(const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/randn_like.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/randn_like.h new file mode 100644 index 0000000000000000000000000000000000000000..8670ab757c5463e4ba11265c9264a48eab4166e4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/randn_like.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor randn_like(const at::Tensor & self, at::TensorOptions options={}, ::std::optional memory_format=::std::nullopt) { + return at::_ops::randn_like::call(self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); +} +// aten::randn_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor randn_like(const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format) { + return at::_ops::randn_like::call(self, dtype, layout, device, pin_memory, memory_format); +} + +// aten::randn_like.generator(Tensor self, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor randn_like(const at::Tensor & self, ::std::optional generator, at::TensorOptions options={}, ::std::optional memory_format=::std::nullopt) { + return at::_ops::randn_like_generator::call(self, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); +} +// aten::randn_like.generator(Tensor self, *, Generator? generator, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor randn_like(const at::Tensor & self, ::std::optional generator, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format) { + return at::_ops::randn_like_generator::call(self, generator, dtype, layout, device, pin_memory, memory_format); +} + +// aten::randn_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & randn_like_out(at::Tensor & out, const at::Tensor & self, ::std::optional memory_format=::std::nullopt) { + return at::_ops::randn_like_out::call(self, memory_format, out); +} +// aten::randn_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & randn_like_outf(const at::Tensor & self, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::randn_like_out::call(self, memory_format, out); +} + +// aten::randn_like.generator_out(Tensor self, *, Generator? generator, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & randn_like_out(at::Tensor & out, const at::Tensor & self, ::std::optional generator, ::std::optional memory_format=::std::nullopt) { + return at::_ops::randn_like_generator_out::call(self, generator, memory_format, out); +} +// aten::randn_like.generator_out(Tensor self, *, Generator? generator, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & randn_like_outf(const at::Tensor & self, ::std::optional generator, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::randn_like_generator_out::call(self, generator, memory_format, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/randn_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/randn_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9dc867cac0dfd1cccb0903f6585a9c3afc010b78 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/randn_native.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & randn_out(at::IntArrayRef size, at::Tensor & out); +TORCH_API at::Tensor randn(at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & randn_out(at::IntArrayRef size, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor randn(at::IntArrayRef size, ::std::optional generator, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor randn(at::IntArrayRef size, ::std::optional names, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & randn_names_out_symint(c10::SymIntArrayRef size, ::std::optional names, at::Tensor & out); +TORCH_API at::Tensor randn(at::IntArrayRef size, ::std::optional generator, ::std::optional names, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & randn_generator_with_names_out_symint(c10::SymIntArrayRef size, ::std::optional generator, ::std::optional names, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/random.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/random.h new file mode 100644 index 0000000000000000000000000000000000000000..f59cfa4ca4c4e4b2d0cd6629acb454ef05d1bca6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/random.h @@ -0,0 +1,73 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::random.from_out(Tensor self, int from, int? to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_out(at::Tensor & out, const at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt) { + return at::_ops::random_from_out::call(self, from, to, generator, out); +} +// aten::random.from_out(Tensor self, int from, int? to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_outf(const at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator, at::Tensor & out) { + return at::_ops::random_from_out::call(self, from, to, generator, out); +} + +// aten::random.from(Tensor self, int from, int? to, *, Generator? generator=None) -> Tensor +inline at::Tensor random(const at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt) { + return at::_ops::random_from::call(self, from, to, generator); +} + +// aten::random.to_out(Tensor self, int to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_out(at::Tensor & out, const at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt) { + return at::_ops::random_to_out::call(self, to, generator, out); +} +// aten::random.to_out(Tensor self, int to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_outf(const at::Tensor & self, int64_t to, ::std::optional generator, at::Tensor & out) { + return at::_ops::random_to_out::call(self, to, generator, out); +} + +// aten::random.to(Tensor self, int to, *, Generator? generator=None) -> Tensor +inline at::Tensor random(const at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt) { + return at::_ops::random_to::call(self, to, generator); +} + +// aten::random.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_out(at::Tensor & out, const at::Tensor & self, ::std::optional generator=::std::nullopt) { + return at::_ops::random_out::call(self, generator, out); +} +// aten::random.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_outf(const at::Tensor & self, ::std::optional generator, at::Tensor & out) { + return at::_ops::random_out::call(self, generator, out); +} + +// aten::random(Tensor self, *, Generator? generator=None) -> Tensor +inline at::Tensor random(const at::Tensor & self, ::std::optional generator=::std::nullopt) { + return at::_ops::random::call(self, generator); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/random_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/random_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4300cc3bca26b9035cb2247303e73bf23beb1356 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/random_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & random_(at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_(at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_(at::Tensor & self, ::std::optional generator=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/random_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/random_native.h new file mode 100644 index 0000000000000000000000000000000000000000..69f22285c64ec9f2ff8edf411b9bef6cf5595bcf --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/random_native.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor random(const at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_from_out(const at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor & random_(at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_meta_(at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor random(const at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_to_out(const at::Tensor & self, int64_t to, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor & random_(at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_meta_(at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor random(const at::Tensor & self, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_out(const at::Tensor & self, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor & random_(at::Tensor & self, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_meta_(at::Tensor & self, ::std::optional generator=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/randperm_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/randperm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..63c5039a7fabec6c312a32d44b2b6fafdb60f9b6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/randperm_native.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor randperm(int64_t n, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & randperm_out(int64_t n, at::Tensor & out); +TORCH_API at::Tensor randperm(int64_t n, ::std::optional generator, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & randperm_out_cpu(int64_t n, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor & randperm_out_cuda(int64_t n, ::std::optional generator, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reciprocal_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reciprocal_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..52af20b1c6299f7f7326b9c8de5325a2fbafe183 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reciprocal_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor reciprocal(const at::Tensor & self); +TORCH_API at::Tensor & reciprocal_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & reciprocal_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & reciprocal_(at::Tensor & self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reciprocal_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reciprocal_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ebd90b12b94dc4233c9dfbdc71a22a2bb2449411 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reciprocal_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor reciprocal(const at::Tensor & self); +TORCH_API at::Tensor & reciprocal_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & reciprocal_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & reciprocal_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/refine_names.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/refine_names.h new file mode 100644 index 0000000000000000000000000000000000000000..132150bdfb1ee9a9b32856e84a7d7d8782872eb3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/refine_names.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/refine_names_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/refine_names_native.h new file mode 100644 index 0000000000000000000000000000000000000000..15d13b42f65bc54988404e572389c02a0666fd51 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/refine_names_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor refine_names(const at::Tensor & self, at::DimnameList names); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c683799b2a580ebf2597ba5029a487fd42675733 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor reflection_pad1d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor reflection_pad1d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..97ac74755dff5e11d9f7ce51133b63c0463d3256 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor reflection_pad1d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor reflection_pad1d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & reflection_pad1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & reflection_pad1d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input); +TORCH_API at::Tensor & reflection_pad1d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & reflection_pad1d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..017125aa95721580d1d97adf920f46bbe0917df8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_reflection_pad1d_backward : public at::impl::MetaBase { + + + void meta(const at::Tensor & grad_output, const at::Tensor & self, at::ArrayRef padding); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad2d_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..27223fb687e23d447e6ec5d3e92a854c03f8beeb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad2d_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API reflection_pad2d_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::reflection_pad2d"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out); +}; + +struct TORCH_API reflection_pad2d { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::reflection_pad2d"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "reflection_pad2d(Tensor self, SymInt[4] padding) -> Tensor"; + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef padding); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..6471d0566257f4f317d11371d399309a2240bd38 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & reflection_pad3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); + } +} + +// aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & reflection_pad3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); + } +} + +// aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & reflection_pad3d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, padding, grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, padding, grad_input); + } +} + +// aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & reflection_pad3d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, padding, grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, padding, grad_input); + } +} + +// aten::reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor +inline at::Tensor reflection_pad3d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad3d_backward::call(grad_output, self, c10::fromIntArrayRefSlow(padding)); +} +namespace symint { + template >> + at::Tensor reflection_pad3d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad3d_backward::call(grad_output, self, c10::fromIntArrayRefSlow(padding)); + } +} + +// aten::reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor +inline at::Tensor reflection_pad3d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad3d_backward::call(grad_output, self, padding); +} +namespace symint { + template >> + at::Tensor reflection_pad3d_backward(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad3d_backward::call(grad_output, self, padding); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/relu6_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/relu6_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0b8c5e8626fae2fa358a3afc322023e535828761 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/relu6_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor relu6(const at::Tensor & self); +TORCH_API at::Tensor & relu6_(at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/renorm_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/renorm_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a2e323d1522d7805d2e7a0b37ef7ff88053002ae --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/renorm_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor renorm(const at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm); +TORCH_API at::Tensor & renorm_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm); +TORCH_API at::Tensor & renorm_outf(const at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm, at::Tensor & out); +TORCH_API at::Tensor & renorm_(at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d.h new file mode 100644 index 0000000000000000000000000000000000000000..476f4eb42f4b0885f739004cbedbf89b06cab8af --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template >> + at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template >> + at::Tensor & replication_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, padding, out); +} +namespace symint { + template >> + at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, padding, out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, padding, out); +} +namespace symint { + template >> + at::Tensor & replication_pad1d_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, padding, out); + } +} + +// aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor +inline at::Tensor replication_pad1d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, c10::fromIntArrayRefSlow(padding)); +} +namespace symint { + template >> + at::Tensor replication_pad1d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, c10::fromIntArrayRefSlow(padding)); + } +} + +// aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor +inline at::Tensor replication_pad1d_symint(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, padding); +} +namespace symint { + template >> + at::Tensor replication_pad1d(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, padding); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d_backward_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d_backward_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c27c13d85d0785cb2a4ef519a252101d0aaec312 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d_backward_meta_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor replication_pad1d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor replication_pad1d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & replication_pad1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & replication_pad1d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input); +TORCH_API at::Tensor & replication_pad1d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & replication_pad1d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..a755ac6b610577292fee95e07610badad5b02214 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_backward_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor replication_pad2d_backward_cpu(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & replication_pad2d_backward_out_cpu(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input); +TORCH_API at::Tensor replication_pad2d_backward_cuda(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & replication_pad2d_backward_out_cuda(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7f93dc1e2510491a751ec9f066a6d431cd08dacb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_replication_pad2d_out_cpu : public at::meta::structured_replication_pad2d { +void impl(const at::Tensor & self, at::ArrayRef padding, const at::Tensor & out); +}; +struct TORCH_API structured_replication_pad2d_out_cuda : public at::meta::structured_replication_pad2d { +void impl(const at::Tensor & self, at::ArrayRef padding, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj.h new file mode 100644 index 0000000000000000000000000000000000000000..1716001e1491d005a32bd07f1841e47d1869323b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::resolve_conj(Tensor(a) self) -> Tensor(a) +inline at::Tensor resolve_conj(const at::Tensor & self) { + return at::_ops::resolve_conj::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj_native.h new file mode 100644 index 0000000000000000000000000000000000000000..87c273698856be068e08a3e61afb9b0f5823e621 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/resolve_conj_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor resolve_conj(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rms_norm_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rms_norm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..74841080c607e5a7207580605355b58e1c32ac2c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rms_norm_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API rms_norm { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, const ::std::optional &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::rms_norm"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "rms_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor"; + static at::Tensor call(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const ::std::optional & weight, ::std::optional eps); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const ::std::optional & weight, ::std::optional eps); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_relu_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_relu_native.h new file mode 100644 index 0000000000000000000000000000000000000000..568f16aa3c8682ee9a04f704f3bf3fad827e427d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_relu_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple rnn_relu(const at::Tensor & input, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first); +TORCH_API ::std::tuple rnn_relu(const at::Tensor & data, const at::Tensor & batch_sizes, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh.h new file mode 100644 index 0000000000000000000000000000000000000000..119c29f2182ab840e7d8b89cf835ce4e9189da52 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) +inline ::std::tuple rnn_tanh(const at::Tensor & input, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) { + return at::_ops::rnn_tanh_input::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first); +} + +// aten::rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) +inline ::std::tuple rnn_tanh(const at::Tensor & data, const at::Tensor & batch_sizes, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) { + return at::_ops::rnn_tanh_data::call(data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh_cell_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh_cell_native.h new file mode 100644 index 0000000000000000000000000000000000000000..15e7ebc65529d456a9a29245e94a010433129e6e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/rnn_tanh_cell_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor rnn_tanh_cell(const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const ::std::optional & b_ih={}, const ::std::optional & b_hh={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/round_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/round_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8415e5b065dc0d04d364a860e6f9d0fabb3bd644 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/round_cuda_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor round(const at::Tensor & self); +TORCH_API at::Tensor & round_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & round_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & round_(at::Tensor & self); +TORCH_API at::Tensor round(const at::Tensor & self, int64_t decimals); +TORCH_API at::Tensor & round_out(at::Tensor & out, const at::Tensor & self, int64_t decimals); +TORCH_API at::Tensor & round_outf(const at::Tensor & self, int64_t decimals, at::Tensor & out); +TORCH_API at::Tensor & round_(at::Tensor & self, int64_t decimals); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e07f05ce2d8a210161e28d79ccb1f5b577b42a40 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor row_indices(const at::Tensor & self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scalar_tensor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scalar_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..fca9c132c977795534152c4d544fa30bbcc5c98d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scalar_tensor.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor scalar_tensor(const at::Scalar & s, at::TensorOptions options={}) { + return at::_ops::scalar_tensor::call(s, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor scalar_tensor(const at::Scalar & s, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::scalar_tensor::call(s, dtype, layout, device, pin_memory); +} + +// aten::scalar_tensor.out(Scalar s, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & scalar_tensor_out(at::Tensor & out, const at::Scalar & s) { + return at::_ops::scalar_tensor_out::call(s, out); +} +// aten::scalar_tensor.out(Scalar s, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & scalar_tensor_outf(const at::Scalar & s, at::Tensor & out) { + return at::_ops::scalar_tensor_out::call(s, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_native.h new file mode 100644 index 0000000000000000000000000000000000000000..62ca3a45f969329d37a4a11dd5f5a691ee047776 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_scatter_add : public at::meta::structured_scatter_add { +void impl(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, const at::Tensor & out); +}; +TORCH_API at::Tensor scatter_add(const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & src); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..240681f3864c96bc9acaa270e141ef1064b0fce3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor scatter_reduce(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true); +TORCH_API at::Tensor & scatter_reduce_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true); +TORCH_API at::Tensor & scatter_reduce_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out); +TORCH_API at::Tensor & scatter_reduce_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7ece27168ab5549fd1bf7fbf882113e2a282a2ba --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API scatter_reduce_two { + using schema = at::Tensor (const at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &, c10::string_view, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::scatter_reduce"; + static constexpr const char* overload_name = "two"; + static constexpr const char* schema_str = "scatter_reduce.two(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self); +}; + +struct TORCH_API scatter_reduce__two { + using schema = at::Tensor & (at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &, c10::string_view, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::scatter_reduce_"; + static constexpr const char* overload_name = "two"; + static constexpr const char* schema_str = "scatter_reduce_.two(Tensor(a!) self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self); +}; + +struct TORCH_API scatter_reduce_two_out { + using schema = at::Tensor & (const at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &, c10::string_view, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::scatter_reduce"; + static constexpr const char* overload_name = "two_out"; + static constexpr const char* schema_str = "scatter_reduce.two_out(Tensor self, int dim, Tensor index, Tensor src, str reduce, *, bool include_self=True, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/segment_reduce_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/segment_reduce_native.h new file mode 100644 index 0000000000000000000000000000000000000000..386a048fb9079bbb8dbbbfc6ef420b82ed8f5027 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/segment_reduce_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & segment_reduce_out(const at::Tensor & data, c10::string_view reduce, const ::std::optional & lengths, const ::std::optional & indices, const ::std::optional & offsets, int64_t axis, bool unsafe, const ::std::optional & initial, at::Tensor & out); +TORCH_API at::Tensor segment_reduce_kernel(const at::Tensor & data, c10::string_view reduce, const ::std::optional & lengths={}, const ::std::optional & indices={}, const ::std::optional & offsets={}, int64_t axis=0, bool unsafe=false, const ::std::optional & initial=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/set.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/set.h new file mode 100644 index 0000000000000000000000000000000000000000..e03dd0480212a49e57577e9458547c086a42f6b8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/set.h @@ -0,0 +1,167 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +namespace symint { + template >> + at::Tensor & set_(at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set__source_Storage_storage_offset::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride)); + } +} + +namespace symint { + template >> + at::Tensor & set_(at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return at::_ops::set__source_Storage_storage_offset::call(self, source, storage_offset, size, stride); + } +} + +namespace symint { + template >> + at::Tensor & set_(at::Tensor & self, const at::Tensor & source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set__source_Tensor_storage_offset::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride)); + } +} + +namespace symint { + template >> + at::Tensor & set_(at::Tensor & self, const at::Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return at::_ops::set__source_Tensor_storage_offset::call(self, source, storage_offset, size, stride); + } +} + +// aten::set.source_Storage_out(Tensor self, Storage source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, at::Storage source) { + return at::_ops::set_source_Storage_out::call(self, source, out); +} +// aten::set.source_Storage_out(Tensor self, Storage source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_outf(const at::Tensor & self, at::Storage source, at::Tensor & out) { + return at::_ops::set_source_Storage_out::call(self, source, out); +} + +// aten::set.source_Storage(Tensor self, Storage source) -> Tensor +inline at::Tensor set(const at::Tensor & self, at::Storage source) { + return at::_ops::set_source_Storage::call(self, source); +} + +// aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out); +} +namespace symint { + template >> + at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out); + } +} + +// aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_outf(const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out); +} +namespace symint { + template >> + at::Tensor & set_outf(const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out); + } +} + +// aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_symint_out(at::Tensor & out, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, size, stride, out); +} +namespace symint { + template >> + at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, size, stride, out); + } +} + +// aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_symint_outf(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, size, stride, out); +} +namespace symint { + template >> + at::Tensor & set_outf(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, size, stride, out); + } +} + +// aten::set.source_Storage_storage_offset(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor +inline at::Tensor set(const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride)); +} +namespace symint { + template >> + at::Tensor set(const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride)); + } +} + +// aten::set.source_Storage_storage_offset(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor +inline at::Tensor set_symint(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset::call(self, source, storage_offset, size, stride); +} +namespace symint { + template >> + at::Tensor set(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset::call(self, source, storage_offset, size, stride); + } +} + +// aten::set.source_Tensor_out(Tensor self, Tensor source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & source) { + return at::_ops::set_source_Tensor_out::call(self, source, out); +} +// aten::set.source_Tensor_out(Tensor self, Tensor source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_outf(const at::Tensor & self, const at::Tensor & source, at::Tensor & out) { + return at::_ops::set_source_Tensor_out::call(self, source, out); +} + +// aten::set.source_Tensor(Tensor self, Tensor source) -> Tensor +inline at::Tensor set(const at::Tensor & self, const at::Tensor & source) { + return at::_ops::set_source_Tensor::call(self, source); +} + +// aten::set.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::set_out::call(self, out); +} +// aten::set.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::set_out::call(self, out); +} + +// aten::set(Tensor self) -> Tensor +inline at::Tensor set(const at::Tensor & self) { + return at::_ops::set::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/set_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/set_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..49701c440080348ec412c01b51dcd31a4b1c7744 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/set_ops.h @@ -0,0 +1,166 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API set__source_Storage { + using schema = at::Tensor & (at::Tensor &, at::Storage); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_"; + static constexpr const char* overload_name = "source_Storage"; + static constexpr const char* schema_str = "set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, at::Storage source); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Storage source); +}; + +struct TORCH_API set__source_Storage_storage_offset { + using schema = at::Tensor & (at::Tensor &, at::Storage, c10::SymInt, c10::SymIntArrayRef, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_"; + static constexpr const char* overload_name = "source_Storage_storage_offset"; + static constexpr const char* schema_str = "set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); +}; + +struct TORCH_API set__source_Tensor_storage_offset { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &, c10::SymInt, c10::SymIntArrayRef, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_"; + static constexpr const char* overload_name = "source_Tensor_storage_offset"; + static constexpr const char* schema_str = "set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); +}; + +struct TORCH_API set__source_Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_"; + static constexpr const char* overload_name = "source_Tensor"; + static constexpr const char* schema_str = "set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & source); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & source); +}; + +struct TORCH_API set_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "set_(Tensor(a!) self) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API set_source_Storage_out { + using schema = at::Tensor & (const at::Tensor &, at::Storage, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Storage_out"; + static constexpr const char* schema_str = "set.source_Storage_out(Tensor self, Storage source, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Storage source, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, at::Tensor & out); +}; + +struct TORCH_API set_source_Storage { + using schema = at::Tensor (const at::Tensor &, at::Storage); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Storage"; + static constexpr const char* schema_str = "set.source_Storage(Tensor self, Storage source) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::Storage source); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source); +}; + +struct TORCH_API set_source_Storage_storage_offset_out { + using schema = at::Tensor & (const at::Tensor &, at::Storage, c10::SymInt, c10::SymIntArrayRef, c10::SymIntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Storage_storage_offset_out"; + static constexpr const char* schema_str = "set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out); +}; + +struct TORCH_API set_source_Storage_storage_offset { + using schema = at::Tensor (const at::Tensor &, at::Storage, c10::SymInt, c10::SymIntArrayRef, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Storage_storage_offset"; + static constexpr const char* schema_str = "set.source_Storage_storage_offset(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); +}; + +struct TORCH_API set_source_Tensor_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Tensor_out"; + static constexpr const char* schema_str = "set.source_Tensor_out(Tensor self, Tensor source, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & source, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & source, at::Tensor & out); +}; + +struct TORCH_API set_source_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Tensor"; + static constexpr const char* schema_str = "set.source_Tensor(Tensor self, Tensor source) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & source); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & source); +}; + +struct TORCH_API set_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "set.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +struct TORCH_API set { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "set(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sgn_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sgn_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5a5b46f021317bfdc8a8a81e241fa6ea5799ddb3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sgn_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor sgn(const at::Tensor & self); +TORCH_API at::Tensor & sgn_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & sgn_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & sgn_(at::Tensor & self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/silu.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/silu.h new file mode 100644 index 0000000000000000000000000000000000000000..e40f5ef108e0b9cef30c1afe8703e248aedda5f2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/silu.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::silu(Tensor self) -> Tensor +inline at::Tensor silu(const at::Tensor & self) { + return at::_ops::silu::call(self); +} + +// aten::silu_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & silu_(at::Tensor & self) { + return at::_ops::silu_::call(self); +} + +// aten::silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & silu_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::silu_out::call(self, out); +} +// aten::silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & silu_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::silu_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/silu_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/silu_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8ad7819c71a4b62a8f346550817e51216d0e5695 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/silu_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API silu_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::silu_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input); +}; + +struct TORCH_API silu_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::silu_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "silu_backward(Tensor grad_output, Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/silu_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/silu_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..7b0eb00bdf69acc7dc935f62a4590474eca07cbd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/silu_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_silu : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sin_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sin_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b2251a63ac29ccf06494eb8df8622736a754a08b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sin_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor sin(const at::Tensor & self); +TORCH_API at::Tensor & sin_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & sin_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & sin_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sinc_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sinc_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..818dcfdb4b46c387d1f389b544308a518dbf269f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sinc_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor sinc(const at::Tensor & self); +TORCH_API at::Tensor & sinc_(at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/size_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/size_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..79827da20501da1fc2eedcebd880cd82e59bf4aa --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/size_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API size_int { + using schema = int64_t (const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::size"; + static constexpr const char* overload_name = "int"; + static constexpr const char* schema_str = "size.int(Tensor self, int dim) -> int"; + static int64_t call(const at::Tensor & self, int64_t dim); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim); +}; + +struct TORCH_API size_Dimname { + using schema = int64_t (const at::Tensor &, at::Dimname); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::size"; + static constexpr const char* overload_name = "Dimname"; + static constexpr const char* schema_str = "size.Dimname(Tensor self, Dimname dim) -> int"; + static int64_t call(const at::Tensor & self, at::Dimname dim); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slice_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slice_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f4f10eca089c0306ffab90fb54093808a993c8e6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slice_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor slice_backward(const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t start, int64_t end, int64_t step); +TORCH_API at::Tensor & slice_backward_out_symint(const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt start, c10::SymInt end, c10::SymInt step, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse.h new file mode 100644 index 0000000000000000000000000000000000000000..29a86e4eef1f7309606cb092256b05211845e84f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) +inline at::Tensor slice_inverse(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, int64_t step=1) { + return at::_ops::slice_inverse::call(self, src, dim, start.has_value() ? ::std::make_optional(c10::SymInt(*start)) : ::std::nullopt, end.has_value() ? ::std::make_optional(c10::SymInt(*end)) : ::std::nullopt, step); +} +namespace symint { + template >> + at::Tensor slice_inverse(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, int64_t step=1) { + return at::_ops::slice_inverse::call(self, src, dim, start.has_value() ? ::std::make_optional(c10::SymInt(*start)) : ::std::nullopt, end.has_value() ? ::std::make_optional(c10::SymInt(*end)) : ::std::nullopt, step); + } +} + +// aten::slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) +inline at::Tensor slice_inverse_symint(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, c10::SymInt step=1) { + return at::_ops::slice_inverse::call(self, src, dim, start, end, step); +} +namespace symint { + template >> + at::Tensor slice_inverse(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, c10::SymInt step=1) { + return at::_ops::slice_inverse::call(self, src, dim, start, end, step); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slice_scatter_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slice_scatter_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b186bfa3d4d7806e8108ad2285be9c9d38765595 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slice_scatter_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & slice_scatter_out_symint(const at::Tensor & self, const at::Tensor & src, int64_t dim, ::std::optional start, ::std::optional end, c10::SymInt step, at::Tensor & out); +TORCH_API at::Tensor slice_scatter(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, int64_t step=1); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7f4e0b4ae360fbd04b38ad67cde7c655500f73ff --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor slow_conv3d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0); +TORCH_API at::Tensor slow_conv3d_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0)); +TORCH_API at::Tensor & slow_conv3d_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0); +TORCH_API at::Tensor & slow_conv3d_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & out); +TORCH_API at::Tensor & slow_conv3d_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0)); +TORCH_API at::Tensor & slow_conv3d_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_forward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_forward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d7400b77e89c7cdedc67ecce69dbaabd5fd5fac0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_forward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor slow_conv3d_forward_cpu(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding); +TORCH_API at::Tensor & slow_conv3d_forward_out_cpu(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & output); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..6494b3d433d7ef3aa3a5435170bdcfc9685034bc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor slow_conv3d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0); +TORCH_API at::Tensor & slow_conv3d_out(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_dilated2d_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_dilated2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3a2d42398c13fa361870d8d3503694fd5590fd20 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_dilated2d_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor slow_conv_dilated2d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1); +TORCH_API at::Tensor slow_conv_dilated2d_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f948c61e0902061f7ed6cec000648b106a50cf45 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor slow_conv_transpose2d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1); +TORCH_API at::Tensor slow_conv_transpose2d_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c7838ee18c49287188f031d69c0bc7313f1a20d2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor smooth_l1_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double beta=1.0); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ca9e947fe85c7848d88b2eec17e93386437f67f8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor smooth_l1_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double beta=1.0); +TORCH_API at::Tensor & smooth_l1_loss_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double beta=1.0); +TORCH_API at::Tensor & smooth_l1_loss_outf(const at::Tensor & self, const at::Tensor & target, int64_t reduction, double beta, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5d55aadc9cd745fe44c7365c7f225e8acdc06d9e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor softplus_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); +TORCH_API at::Tensor & softplus_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); +TORCH_API at::Tensor & softplus_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4a6b9a1aa406912e9cf2576145f40d59ba81008b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API softplus_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::softplus"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & out); +}; + +struct TORCH_API softplus { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::softplus"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sort_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sort_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5986abc775aa12061904dc79ae4ef26a96e71316 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sort_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API ::std::tuple sort(const at::Tensor & self, ::std::optional stable, int64_t dim=-1, bool descending=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sort_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sort_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..337864399a78342898073c9d3efee9c0ce456051 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sort_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API ::std::tuple sort(const at::Tensor & self, ::std::optional stable, int64_t dim=-1, bool descending=false); +TORCH_API ::std::tuple sort_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, ::std::optional stable, int64_t dim=-1, bool descending=false); +TORCH_API ::std::tuple sort_outf(const at::Tensor & self, ::std::optional stable, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sort_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sort_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7db065e2087343b78e6aab64ed8a25667df997b7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sort_ops.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API sort_values { + using schema = ::std::tuple (const at::Tensor &, int64_t, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "values"; + static constexpr const char* schema_str = "sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API sort_values_stable { + using schema = ::std::tuple (const at::Tensor &, ::std::optional, int64_t, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "values_stable"; + static constexpr const char* schema_str = "sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, ::std::optional stable, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional stable, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API sort { + using schema = ::std::tuple (const at::Tensor &, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, int64_t dim, bool descending); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool descending); +}; + +struct TORCH_API sort_stable { + using schema = ::std::tuple (const at::Tensor &, ::std::optional, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "stable"; + static constexpr const char* schema_str = "sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, ::std::optional stable, int64_t dim, bool descending); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional stable, int64_t dim, bool descending); +}; + +struct TORCH_API sort_dimname_values { + using schema = ::std::tuple (const at::Tensor &, at::Dimname, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "dimname_values"; + static constexpr const char* schema_str = "sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, at::Dimname dim, bool descending, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool descending, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API sort_dimname_values_stable { + using schema = ::std::tuple (const at::Tensor &, ::std::optional, at::Dimname, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "dimname_values_stable"; + static constexpr const char* schema_str = "sort.dimname_values_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, ::std::optional stable, at::Dimname dim, bool descending, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional stable, at::Dimname dim, bool descending, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API sort_dimname { + using schema = ::std::tuple (const at::Tensor &, at::Dimname, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "dimname"; + static constexpr const char* schema_str = "sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, at::Dimname dim, bool descending); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool descending); +}; + +struct TORCH_API sort_dimname_stable { + using schema = ::std::tuple (const at::Tensor &, ::std::optional, at::Dimname, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "dimname_stable"; + static constexpr const char* schema_str = "sort.dimname_stable(Tensor self, *, bool? stable, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, ::std::optional stable, at::Dimname dim, bool descending); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional stable, at::Dimname dim, bool descending); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsc_tensor_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsc_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..85b9d715b3330f41b9a267d8be83002834940750 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsc_tensor_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor sparse_bsc_tensor(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor sparse_bsc_tensor(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..37913ff300d00cf8bd67588f2cddab5a7c6ed250 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor sparse_bsr_tensor(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor sparse_bsr_tensor(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..e9c02910cd79a5376ebf536b06a7f7215e5b132d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API sparse_bsr_tensor_crow_col_value_size { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::IntArrayRef, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sparse_bsr_tensor"; + static constexpr const char* overload_name = "crow_col_value_size"; + static constexpr const char* schema_str = "sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor"; + static at::Tensor call(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API sparse_bsr_tensor_crow_col_value { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sparse_bsr_tensor"; + static constexpr const char* overload_name = "crow_col_value"; + static constexpr const char* schema_str = "sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor"; + static at::Tensor call(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_coo_tensor.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_coo_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..b00fab9f947d27b951279cd39c424e9cf1f452ba --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_coo_tensor.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +inline at::Tensor sparse_coo_tensor(at::IntArrayRef size, at::TensorOptions options) { + return at::_ops::sparse_coo_tensor_size::call(size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +inline at::Tensor sparse_coo_tensor(at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::sparse_coo_tensor_size::call(size, dtype, layout, device, pin_memory); +} + +// aten::sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor +inline at::Tensor sparse_coo_tensor(const at::Tensor & indices, const at::Tensor & values, at::TensorOptions options={}, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::sparse_coo_tensor_indices::call(indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced); +} +// aten::sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor +inline at::Tensor sparse_coo_tensor(const at::Tensor & indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional is_coalesced) { + return at::_ops::sparse_coo_tensor_indices::call(indices, values, dtype, layout, device, pin_memory, is_coalesced); +} + +// aten::sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor +inline at::Tensor sparse_coo_tensor(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::sparse_coo_tensor_indices_size::call(indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced); +} +// aten::sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor +inline at::Tensor sparse_coo_tensor(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional is_coalesced) { + return at::_ops::sparse_coo_tensor_indices_size::call(indices, values, size, dtype, layout, device, pin_memory, is_coalesced); +} + +// aten::sparse_coo_tensor.size_out(int[] size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & sparse_coo_tensor_out(at::Tensor & out, at::IntArrayRef size) { + return at::_ops::sparse_coo_tensor_size_out::call(size, out); +} +// aten::sparse_coo_tensor.size_out(int[] size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & sparse_coo_tensor_outf(at::IntArrayRef size, at::Tensor & out) { + return at::_ops::sparse_coo_tensor_size_out::call(size, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_dim.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_dim.h new file mode 100644 index 0000000000000000000000000000000000000000..2f094b4f7fdfcb46268afa5987578f2e35fe6edb --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_dim.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3e6767d9353ac856669de7431f3bc2ec992185ed --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_compositeexplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor sparse_resize(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); +TORCH_API const at::Tensor & sparse_resize_out(const at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); +TORCH_API const at::Tensor & sparse_resize_outf(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim, const at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..12345269c4d3365702bc4f971e6d6e047b7bcf56 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor special_bessel_j0(const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_j0_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_j0_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..bd227e18fd8411a52977bc99c323e008043abe76 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_bessel_j0 { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_j0"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_bessel_j0(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API special_bessel_j0_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_j0"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j1_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j1_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f9fa1b85b2c2cc25a14f5d98db8d0cdd140717e1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j1_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_bessel_j1 { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_j1"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_bessel_j1(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API special_bessel_j1_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_j1"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..986d60cb7e9facb2ab3859fa3a5ce800f6493194 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor special_bessel_y0(const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y0_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y0_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1bc66a6ebc2ab6f913fd8c9032857ba1238418cc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor special_bessel_y1(const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y1_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y1_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..03cbf711476beb57ce2615b06c2a29495da1375f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_bessel_y1(const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y1_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y1_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_native.h new file mode 100644 index 0000000000000000000000000000000000000000..6ae1bd2ff006f91e5a11bc83191a8edb5e336bf6 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_special_bessel_y1_out : public at::meta::structured_special_bessel_y1 { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d1c555c260e2d9158ec2b61f87d19c2be9c37042 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_bessel_y1 { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_y1"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_bessel_y1(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API special_bessel_y1_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_y1"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7306437121594c70e2fde4899234967af1e33d0d --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_compositeexplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor special_chebyshev_polynomial_u(const at::Scalar & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_out(at::Tensor & out, const at::Scalar & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_outf(const at::Scalar & x, const at::Tensor & n, at::Tensor & out); +TORCH_API at::Tensor special_chebyshev_polynomial_u(const at::Tensor & x, const at::Scalar & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_out(at::Tensor & out, const at::Tensor & x, const at::Scalar & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_outf(const at::Tensor & x, const at::Scalar & n, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..960010c24191f2429b2be291718e19443a379cd9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor special_chebyshev_polynomial_u(const at::Tensor & x, const at::Tensor & n); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..65e27a1cce3e2e180ab32ddd8d94b10c1a292555 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_chebyshev_polynomial_u(const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..03a41b18f3ecfb3282a2a20fff83479809caf253 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor special_chebyshev_polynomial_v(const at::Tensor & x, const at::Tensor & n); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_digamma_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_digamma_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..12161eef9dc647334bdfc539d73bc6b2defb3e00 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_digamma_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor special_digamma(const at::Tensor & self); +TORCH_API at::Tensor & special_digamma_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_digamma_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_entr_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_entr_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..49107dc0f7e4bc4c03704f50baef352680f56a7f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_entr_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor special_entr(const at::Tensor & self); +TORCH_API at::Tensor & special_entr_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_entr_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_erfcx_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_erfcx_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f4e62c273418c4220988b313124beffd1eb64ed5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_erfcx_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_special_erfcx_out : public at::meta::structured_special_erfcx { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_exp2_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_exp2_native.h new file mode 100644 index 0000000000000000000000000000000000000000..6176e342151b38208607c276c4b9e138718d17bd --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_exp2_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor special_exp2(const at::Tensor & self); +TORCH_API at::Tensor & special_exp2_out(const at::Tensor & self, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_legendre_polynomial_p_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_legendre_polynomial_p_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7e46187f5d93dc54699a29e0fe94d99eea7397cc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_legendre_polynomial_p_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor special_legendre_polynomial_p(const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_legendre_polynomial_p_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_legendre_polynomial_p_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_i0.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_i0.h new file mode 100644 index 0000000000000000000000000000000000000000..e0f2c6d9372b10b5b405f593989ef140c40c6219 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_i0.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_modified_bessel_i0(Tensor self) -> Tensor +inline at::Tensor special_modified_bessel_i0(const at::Tensor & self) { + return at::_ops::special_modified_bessel_i0::call(self); +} + +// aten::special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_modified_bessel_i0_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::special_modified_bessel_i0_out::call(self, out); +} +// aten::special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_modified_bessel_i0_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::special_modified_bessel_i0_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k0_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k0_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..96f8c2be0776afd38c7b00d8f65108aa2378a869 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k0_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor special_modified_bessel_k0(const at::Tensor & self); +TORCH_API at::Tensor & special_modified_bessel_k0_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_modified_bessel_k0_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_multigammaln_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_multigammaln_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f5d8465b1e53f2ba2cc91358a52a774c76c57c69 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_multigammaln_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor special_multigammaln(const at::Tensor & self, int64_t p); +TORCH_API at::Tensor & special_multigammaln_out(at::Tensor & out, const at::Tensor & self, int64_t p); +TORCH_API at::Tensor & special_multigammaln_outf(const at::Tensor & self, int64_t p, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_ndtri_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_ndtri_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ec3c35b9e42a558ce480834879c955e672214292 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_ndtri_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_ndtri(const at::Tensor & self); +TORCH_API at::Tensor & special_ndtri_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_ndtri_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_polygamma_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_polygamma_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..553246cf44eb0d2f998d6b5d7e3ca97b03bf5bed --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_polygamma_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor special_polygamma(int64_t n, const at::Tensor & self); +TORCH_API at::Tensor & special_polygamma_out(at::Tensor & out, int64_t n, const at::Tensor & self); +TORCH_API at::Tensor & special_polygamma_outf(int64_t n, const at::Tensor & self, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_psi.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_psi.h new file mode 100644 index 0000000000000000000000000000000000000000..6816ac1db3f471f0e726848df575f70a3587110c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_psi.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_psi(Tensor self) -> Tensor +inline at::Tensor special_psi(const at::Tensor & self) { + return at::_ops::special_psi::call(self); +} + +// aten::special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_psi_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::special_psi_out::call(self, out); +} +// aten::special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_psi_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::special_psi_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_round_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_round_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fc512ab7b9e1115802178f5599407c4c995f8148 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_round_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_round { + using schema = at::Tensor (const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_round"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_round(Tensor self, *, int decimals=0) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t decimals); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t decimals); +}; + +struct TORCH_API special_round_out { + using schema = at::Tensor & (const at::Tensor &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_round"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_round.out(Tensor self, *, int decimals=0, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, int64_t decimals, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t decimals, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t.h new file mode 100644 index 0000000000000000000000000000000000000000..df138b2ccf38040229fa65f580b371a16e97e4f2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t.h @@ -0,0 +1,73 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_shifted_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor +inline at::Tensor special_shifted_chebyshev_polynomial_t(const at::Tensor & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t::call(x, n); +} + +// aten::special_shifted_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor +inline at::Tensor special_shifted_chebyshev_polynomial_t(const at::Scalar & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t_x_scalar::call(x, n); +} + +// aten::special_shifted_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor +inline at::Tensor special_shifted_chebyshev_polynomial_t(const at::Tensor & x, const at::Scalar & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t_n_scalar::call(x, n); +} + +// aten::special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t_out::call(x, n, out); +} +// aten::special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out) { + return at::_ops::special_shifted_chebyshev_polynomial_t_out::call(x, n, out); +} + +// aten::special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_out(at::Tensor & out, const at::Scalar & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t_x_scalar_out::call(x, n, out); +} +// aten::special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_outf(const at::Scalar & x, const at::Tensor & n, at::Tensor & out) { + return at::_ops::special_shifted_chebyshev_polynomial_t_x_scalar_out::call(x, n, out); +} + +// aten::special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_out(at::Tensor & out, const at::Tensor & x, const at::Scalar & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t_n_scalar_out::call(x, n, out); +} +// aten::special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_outf(const at::Tensor & x, const at::Scalar & n, at::Tensor & out) { + return at::_ops::special_shifted_chebyshev_polynomial_t_n_scalar_out::call(x, n, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..6ebc3c2bb86793c461aea37812bb4fedef394b0c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_special_shifted_chebyshev_polynomial_t : public TensorIteratorBase { + + + void meta(const at::Tensor & x, const at::Tensor & n); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_sinc.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_sinc.h new file mode 100644 index 0000000000000000000000000000000000000000..4f15a5d8b45211e41a5e4554c57ce9bf27e3fc8e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_sinc.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_sinc(Tensor self) -> Tensor +inline at::Tensor special_sinc(const at::Tensor & self) { + return at::_ops::special_sinc::call(self); +} + +// aten::special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_sinc_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::special_sinc_out::call(self, out); +} +// aten::special_sinc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_sinc_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::special_sinc_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_spherical_bessel_j0_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_spherical_bessel_j0_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..afb0cf88861c17a25b3621f190e93027f1963e5a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_spherical_bessel_j0_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_spherical_bessel_j0 { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_spherical_bessel_j0"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_spherical_bessel_j0(Tensor x) -> Tensor"; + static at::Tensor call(const at::Tensor & x); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x); +}; + +struct TORCH_API special_spherical_bessel_j0_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_spherical_bessel_j0"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_spherical_bessel_j0.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & x, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e2b4d02ac8a17701d0bd05c0b86d72ebb7a6d4c8 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor special_xlog1py(const at::Tensor & self, const at::Tensor & other); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..85adf4c7eecf18f68748b2090b6b1f8da70fb200 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_xlog1py(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & special_xlog1py_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & special_xlog1py_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..ee990de61e862c9297ccfb8979c9790c6c28deb3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_special_xlog1py : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..566e816177a3e938c4dccc17c092bf57cb52f199 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_xlog1py { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_xlog1py(Tensor self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API special_xlog1py_self_scalar { + using schema = at::Tensor (const at::Scalar &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = "self_scalar"; + static constexpr const char* schema_str = "special_xlog1py.self_scalar(Scalar self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Scalar & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other); +}; + +struct TORCH_API special_xlog1py_other_scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = "other_scalar"; + static constexpr const char* schema_str = "special_xlog1py.other_scalar(Tensor self, Scalar other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other); +}; + +struct TORCH_API special_xlog1py_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_xlog1py.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +}; + +struct TORCH_API special_xlog1py_self_scalar_out { + using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = "self_scalar_out"; + static constexpr const char* schema_str = "special_xlog1py.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Scalar & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out); +}; + +struct TORCH_API special_xlog1py_other_scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = "other_scalar_out"; + static constexpr const char* schema_str = "special_xlog1py.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/split.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/split.h new file mode 100644 index 0000000000000000000000000000000000000000..35a3e1545056ec3f35eee4a8e0be1c3877bacf71 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/split.h @@ -0,0 +1,75 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[] +inline ::std::vector split(const at::Tensor & self, int64_t split_size, int64_t dim=0) { + return at::_ops::split_Tensor::call(self, split_size, dim); +} +namespace symint { + template >> + ::std::vector split(const at::Tensor & self, int64_t split_size, int64_t dim=0) { + return at::_ops::split_Tensor::call(self, split_size, dim); + } +} + +// aten::split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[] +inline ::std::vector split_symint(const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) { + return at::_ops::split_Tensor::call(self, split_size, dim); +} +namespace symint { + template >> + ::std::vector split(const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) { + return at::_ops::split_Tensor::call(self, split_size, dim); + } +} + +// aten::split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[] +inline ::std::vector split(const at::Tensor & self, at::IntArrayRef split_size, int64_t dim=0) { + return at::_ops::split_sizes::call(self, c10::fromIntArrayRefSlow(split_size), dim); +} +namespace symint { + template >> + ::std::vector split(const at::Tensor & self, at::IntArrayRef split_size, int64_t dim=0) { + return at::_ops::split_sizes::call(self, c10::fromIntArrayRefSlow(split_size), dim); + } +} + +// aten::split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[] +inline ::std::vector split_symint(const at::Tensor & self, c10::SymIntArrayRef split_size, int64_t dim=0) { + return at::_ops::split_sizes::call(self, split_size, dim); +} +namespace symint { + template >> + ::std::vector split(const at::Tensor & self, c10::SymIntArrayRef split_size, int64_t dim=0) { + return at::_ops::split_sizes::call(self, split_size, dim); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sspaddmm_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sspaddmm_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2dd623bd69e6b23b8f592e0cf2c5d20e8f664472 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sspaddmm_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor sspaddmm(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sspaddmm_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sspaddmm_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..263681a3f9de22e97f197f2199a70785d7ce24d2 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sspaddmm_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & sspaddmm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & sspaddmm_outf(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/std_mean_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/std_mean_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..23d0e36e0dbb1a4b388ccff3db1e8317d60d1027 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/std_mean_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple std_mean(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, const ::std::optional & correction=::std::nullopt, bool keepdim=false); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/stride_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/stride_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..09f6647b2061ca50b0b45bf18dc08f7b009d00c0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/stride_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API stride_int { + using schema = int64_t (const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::stride"; + static constexpr const char* overload_name = "int"; + static constexpr const char* schema_str = "stride.int(Tensor self, int dim) -> int"; + static int64_t call(const at::Tensor & self, int64_t dim); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim); +}; + +struct TORCH_API stride_Dimname { + using schema = int64_t (const at::Tensor &, at::Dimname); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::stride"; + static constexpr const char* overload_name = "Dimname"; + static constexpr const char* schema_str = "stride.Dimname(Tensor self, Dimname dim) -> int"; + static int64_t call(const at::Tensor & self, at::Dimname dim); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/subtract_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/subtract_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..53cdd0cfd4690bd431c7d27be698dd61a1589fb5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/subtract_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor subtract(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & subtract_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & subtract_outf(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & subtract_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor subtract(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & subtract_(at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/swapaxes_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/swapaxes_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..eddc58e24bd97529ef1d792c6856e30f1444514e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/swapaxes_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor swapaxes(const at::Tensor & self, int64_t axis0, int64_t axis1); +TORCH_API at::Tensor & swapaxes_(at::Tensor & self, int64_t axis0, int64_t axis1); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d1faa0034f3f4f32a9824dd16af45963727e5ac0 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API void sym_constrain_range(const at::Scalar & size, ::std::optional min=::std::nullopt, ::std::optional max=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/t_copy_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/t_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ff1e05e2ee15c52fef9487389663b301c6579d37 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/t_copy_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & t_copy_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor t_copy(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/take_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/take_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..5cb77898a4f0272b7abba27c0ae4fab9431e5f26 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/take_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API take_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::take"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & index, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & index, at::Tensor & out); +}; + +struct TORCH_API take { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::take"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "take(Tensor self, Tensor index) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & index); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & index); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tan_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tan_native.h new file mode 100644 index 0000000000000000000000000000000000000000..18ca1873c08ae2f596b60ab1d2c6ad363def4016 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tan_native.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_tan_out : public at::meta::structured_tan { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +TORCH_API at::Tensor tan_sparse(const at::Tensor & self); +TORCH_API at::Tensor & tan_sparse_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & tan_sparse_(at::Tensor & self); +TORCH_API at::Tensor tan_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & tan_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & tan_sparse_csr_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tensor_split_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tensor_split_native.h new file mode 100644 index 0000000000000000000000000000000000000000..267bfbc5f3ddf35709da8aeb8395760bd4cbb1f5 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tensor_split_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector tensor_split_sections_symint(const at::Tensor & self, c10::SymInt sections, int64_t dim=0); +TORCH_API ::std::vector tensor_split_indices_symint(const at::Tensor & self, c10::SymIntArrayRef indices, int64_t dim=0); +TORCH_API ::std::vector tensor_split(const at::Tensor & self, const at::Tensor & tensor_indices_or_sections, int64_t dim=0); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..8850f544a7aa467758f690a523bdf1d6e8df21b3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_threshold_backward : public TensorIteratorBase { + + + void meta(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..51258c7b71c2c26dbd210f41bc2b41c846a2c529 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API threshold_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::threshold_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold, at::Tensor & grad_input); +}; + +struct TORCH_API threshold_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::threshold_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..399a3df569824909413b5f958170fd7961131b05 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor threshold(const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value); +TORCH_API at::Tensor & threshold_(at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_dense_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_dense_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7cea797e5bc47cf60201c984cb8dc19c8db42e2b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_dense_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API to_dense { + using schema = at::Tensor (const at::Tensor &, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::to_dense"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, ::std::optional dtype, ::std::optional masked_grad); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional dtype, ::std::optional masked_grad); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_backward_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_backward_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fb8881aa3474cb6a2bdb9f6333a3a299713c0791 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_backward_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor to_mkldnn_backward(const at::Tensor & grad, const at::Tensor & input); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_padded_tensor_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_padded_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..78d1d3c0c4383dd5802fca3c0f502d286412049f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_padded_tensor_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & to_padded_tensor_out_symint(const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor NestedTensor_to_padded_tensor_generic(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=::std::nullopt); +TORCH_API at::Tensor NestedTensor_to_padded_tensor_cuda(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_sparse_bsr_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_sparse_bsr_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1e6effd9048bcdada1f677d0e40733fce74db9e3 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/to_sparse_bsr_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API to_sparse_bsr { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::to_sparse_bsr"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/topk_cuda_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/topk_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e18c2984c2b94046761531a117e47ffcfe880902 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/topk_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple topk(const at::Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true); +TORCH_API ::std::tuple topk_symint(const at::Tensor & self, c10::SymInt k, int64_t dim=-1, bool largest=true, bool sorted=true); +TORCH_API ::std::tuple topk_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true); +TORCH_API ::std::tuple topk_outf(const at::Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted, at::Tensor & values, at::Tensor & indices); +TORCH_API ::std::tuple topk_symint_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, c10::SymInt k, int64_t dim=-1, bool largest=true, bool sorted=true); +TORCH_API ::std::tuple topk_symint_outf(const at::Tensor & self, c10::SymInt k, int64_t dim, bool largest, bool sorted, at::Tensor & values, at::Tensor & indices); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/topk_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/topk_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..af2d0b1a114cc94ea8644cad5d58b3325f302cdc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/topk_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_topk : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tril_indices_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tril_indices_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..700d2a825053107e5ca938c14a01af9ee35a0626 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tril_indices_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong); +TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tril_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tril_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2ab2ea42973907e9ade1c88a00517446dd088a21 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tril_meta_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor tril(const at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor tril_symint(const at::Tensor & self, c10::SymInt diagonal=0); +TORCH_API at::Tensor & tril_out(at::Tensor & out, const at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor & tril_outf(const at::Tensor & self, int64_t diagonal, at::Tensor & out); +TORCH_API at::Tensor & tril_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymInt diagonal=0); +TORCH_API at::Tensor & tril_symint_outf(const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out); +TORCH_API at::Tensor & tril_(at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor & tril__symint(at::Tensor & self, c10::SymInt diagonal=0); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tril_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tril_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3fbf6401b983fbd12ee15fd82cf01b45f00b224c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/tril_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API tril_ { + using schema = at::Tensor & (at::Tensor &, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::tril_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "tril_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, c10::SymInt diagonal); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, c10::SymInt diagonal); +}; + +struct TORCH_API tril_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::tril"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "tril.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out); +}; + +struct TORCH_API tril { + using schema = at::Tensor (const at::Tensor &, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::tril"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "tril(Tensor self, SymInt diagonal=0) -> Tensor"; + static at::Tensor call(const at::Tensor & self, c10::SymInt diagonal); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt diagonal); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/triplet_margin_loss_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/triplet_margin_loss_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f53872f4a5bfab95e042e0043b8c9187945bc303 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/triplet_margin_loss_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor triplet_margin_loss(const at::Tensor & anchor, const at::Tensor & positive, const at::Tensor & negative, double margin=1.0, double p=2, double eps=1e-06, bool swap=false, int64_t reduction=at::Reduction::Mean); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/triu_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/triu_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cc80c28740c4c453f529af484efec605a7b9235b --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/triu_cpu_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor triu(const at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor triu_symint(const at::Tensor & self, c10::SymInt diagonal=0); +TORCH_API at::Tensor & triu_out(at::Tensor & out, const at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor & triu_outf(const at::Tensor & self, int64_t diagonal, at::Tensor & out); +TORCH_API at::Tensor & triu_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymInt diagonal=0); +TORCH_API at::Tensor & triu_symint_outf(const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out); +TORCH_API at::Tensor & triu_(at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor & triu__symint(at::Tensor & self, c10::SymInt diagonal=0); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices.h new file mode 100644 index 0000000000000000000000000000000000000000..5e05596a04796a6495d5fcb5f2e6bce50b356823 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor triu_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong) { + return at::_ops::triu_indices::call(row, col, offset, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor triu_indices(int64_t row, int64_t col, int64_t offset, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::triu_indices::call(row, col, offset, dtype, layout, device, pin_memory); +} + +// aten::triu_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & triu_indices_out(at::Tensor & out, int64_t row, int64_t col, int64_t offset=0) { + return at::_ops::triu_indices_out::call(row, col, offset, out); +} +// aten::triu_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & triu_indices_outf(int64_t row, int64_t col, int64_t offset, at::Tensor & out) { + return at::_ops::triu_indices_out::call(row, col, offset, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/true_divide_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/true_divide_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4c2a6f786014abebc1b4ec403dd0720f80801f5f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/true_divide_ops.h @@ -0,0 +1,78 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API true_divide_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::true_divide"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "true_divide.Tensor(Tensor self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API true_divide__Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::true_divide_"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & other); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API true_divide_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::true_divide"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +}; + +struct TORCH_API true_divide_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::true_divide"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "true_divide.Scalar(Tensor self, Scalar other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other); +}; + +struct TORCH_API true_divide__Scalar { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::true_divide_"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & other); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unbind_copy_compositeexplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unbind_copy_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..779fcb95ff255e3b457a9dc092a1f7468dcc0294 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unbind_copy_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API void unbind_copy_out(at::TensorList out, const at::Tensor & self, int64_t dim=0); +TORCH_API void unbind_copy_outf(const at::Tensor & self, int64_t dim, at::TensorList out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors.h new file mode 100644 index 0000000000000000000000000000000000000000..ef1fa5feb1eaaaf1d1f908c2d6a852997180d29c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[] +inline ::std::vector unflatten_dense_tensors(const at::Tensor & flat, at::TensorList tensors) { + return at::_ops::unflatten_dense_tensors::call(flat, tensors); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..339572b2e78c99a3d1660f99949e06a802c3bd6c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API unflatten_dense_tensors { + using schema = ::std::vector (const at::Tensor &, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::unflatten_dense_tensors"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[]"; + static ::std::vector call(const at::Tensor & flat, at::TensorList tensors); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & flat, at::TensorList tensors); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_backward_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d0379fcfa0d812120afa20bbd52453af91ac58e4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_backward_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor unfold_backward(const at::Tensor & grad_in, at::IntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step); +TORCH_API at::Tensor unfold_backward_symint(const at::Tensor & grad_in, c10::SymIntArrayRef input_sizes, int64_t dim, int64_t size, int64_t step); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..bc67d28ed4dab093dd813b25d332783bd35bd793 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API unfold { + using schema = at::Tensor (const at::Tensor &, int64_t, int64_t, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::unfold"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d33ffa20727f7847813886dd89e13497b796ecb1 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API uniform_ { + using schema = at::Tensor & (at::Tensor &, double, double, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::uniform_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, double from, double to, ::std::optional generator); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double from, double to, ::std::optional generator); +}; + +struct TORCH_API uniform_out { + using schema = at::Tensor & (const at::Tensor &, double, double, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::uniform"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "uniform.out(Tensor self, float from=0, float to=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, double from, double to, ::std::optional generator, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double from, double to, ::std::optional generator, at::Tensor & out); +}; + +struct TORCH_API uniform { + using schema = at::Tensor (const at::Tensor &, double, double, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::uniform"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "uniform(Tensor self, float from=0, float to=1, *, Generator? generator=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, double from, double to, ::std::optional generator); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double from, double to, ::std::optional generator); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unique_dim_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unique_dim_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ae3167dbb12abf13532dd80f7b89f2ef9f73a61f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/unique_dim_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple unique_dim_out(const at::Tensor & self, int64_t dim, bool sorted, bool return_inverse, bool return_counts, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); +TORCH_API ::std::tuple unique_dim_cpu(const at::Tensor & self, int64_t dim, bool sorted=true, bool return_inverse=false, bool return_counts=false); +TORCH_API ::std::tuple unique_dim_cuda(const at::Tensor & self, int64_t dim, bool sorted=true, bool return_inverse=false, bool return_counts=false); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..999d76e19fbf1d46452cdf7f76c9aa4a04c7cae9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_backward.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & upsample_bilinear2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::upsample_bilinear2d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input); +} +namespace symint { + template >> + at::Tensor & upsample_bilinear2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::upsample_bilinear2d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input); + } +} + +// aten::upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & upsample_bilinear2d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input) { + return at::_ops::upsample_bilinear2d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input); +} +namespace symint { + template >> + at::Tensor & upsample_bilinear2d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input) { + return at::_ops::upsample_bilinear2d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input); + } +} + +// aten::upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & upsample_bilinear2d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::upsample_bilinear2d_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input); +} +namespace symint { + template >> + at::Tensor & upsample_bilinear2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::upsample_bilinear2d_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input); + } +} + +// aten::upsample_bilinear2d_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & upsample_bilinear2d_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input) { + return at::_ops::upsample_bilinear2d_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input); +} +namespace symint { + template >> + at::Tensor & upsample_bilinear2d_backward_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input) { + return at::_ops::upsample_bilinear2d_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input); + } +} + +// aten::upsample_bilinear2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor +inline at::Tensor upsample_bilinear2d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::upsample_bilinear2d_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w); +} +namespace symint { + template >> + at::Tensor upsample_bilinear2d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::upsample_bilinear2d_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w); + } +} + +// aten::upsample_bilinear2d_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor +inline at::Tensor upsample_bilinear2d_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::upsample_bilinear2d_backward::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w); +} +namespace symint { + template >> + at::Tensor upsample_bilinear2d_backward(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::upsample_bilinear2d_backward::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_backward_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..a3c668a4752ea18b1e04515a17bdef5dfe026585 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_backward_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_upsample_bilinear2d_backward_out_cpu : public at::meta::structured_upsample_bilinear2d_backward { +void impl(const at::Tensor & grad_output, at::ArrayRef output_size, at::ArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, const at::Tensor & grad_input); +}; +struct TORCH_API structured_upsample_bilinear2d_backward_out_cuda : public at::meta::structured_upsample_bilinear2d_backward { +void impl(const at::Tensor & grad_output, at::ArrayRef output_size, at::ArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, const at::Tensor & grad_input); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_compositeexplicitautogradnonfunctional_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..26131cf94bb7acb6b525a6cb3fd3a6d548411e3a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor upsample_bilinear2d(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor upsample_bilinear2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest1d_backward.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest1d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..01c552f55bf44b240088568f4bfd941a03d65a0f --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest1d_backward.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & upsample_nearest1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::upsample_nearest1d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input); +} +namespace symint { + template >> + at::Tensor & upsample_nearest1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::upsample_nearest1d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input); + } +} + +// aten::upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & upsample_nearest1d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales, at::Tensor & grad_input) { + return at::_ops::upsample_nearest1d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input); +} +namespace symint { + template >> + at::Tensor & upsample_nearest1d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales, at::Tensor & grad_input) { + return at::_ops::upsample_nearest1d_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales, grad_input); + } +} + +// aten::upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & upsample_nearest1d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::upsample_nearest1d_backward_grad_input::call(grad_output, output_size, input_size, scales, grad_input); +} +namespace symint { + template >> + at::Tensor & upsample_nearest1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::upsample_nearest1d_backward_grad_input::call(grad_output, output_size, input_size, scales, grad_input); + } +} + +// aten::upsample_nearest1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & upsample_nearest1d_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales, at::Tensor & grad_input) { + return at::_ops::upsample_nearest1d_backward_grad_input::call(grad_output, output_size, input_size, scales, grad_input); +} +namespace symint { + template >> + at::Tensor & upsample_nearest1d_backward_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales, at::Tensor & grad_input) { + return at::_ops::upsample_nearest1d_backward_grad_input::call(grad_output, output_size, input_size, scales, grad_input); + } +} + +// aten::upsample_nearest1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor +inline at::Tensor upsample_nearest1d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::upsample_nearest1d_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales); +} +namespace symint { + template >> + at::Tensor upsample_nearest1d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::upsample_nearest1d_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), scales); + } +} + +// aten::upsample_nearest1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, float? scales=None) -> Tensor +inline at::Tensor upsample_nearest1d_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::upsample_nearest1d_backward::call(grad_output, output_size, input_size, scales); +} +namespace symint { + template >> + at::Tensor upsample_nearest1d_backward(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales=::std::nullopt) { + return at::_ops::upsample_nearest1d_backward::call(grad_output, output_size, input_size, scales); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_cpu_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..709673dda5f6df65317d88e46f51a44a1b75ef50 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor upsample_nearest3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor upsample_nearest3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_nearest3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_nearest3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +TORCH_API at::Tensor & upsample_nearest3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_nearest3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_meta.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..ba56efd75b79e00d71c900afd57e0f2b37548b38 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_upsample_nearest3d : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, at::ArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/vander_compositeimplicitautograd_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/vander_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c02ca370e0cf59b5ee18194de4e8f09cee87f502 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/vander_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor vander(const at::Tensor & x, ::std::optional N=::std::nullopt, bool increasing=false); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1e6034883e3b059b201aeb2f1831c98518b0fe59 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor view_as(const at::Tensor & self, const at::Tensor & other); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_real_copy_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_real_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7d46b7fc87cae03114007d9dc5fc242e9a74eed9 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_real_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API view_as_real_copy { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::view_as_real_copy"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "view_as_real_copy(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API view_as_real_copy_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::view_as_real_copy"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "view_as_real_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/vsplit.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/vsplit.h new file mode 100644 index 0000000000000000000000000000000000000000..54a26e4e3da828e7334c5897e1882cef17a6d89a --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/vsplit.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::vsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[] +inline ::std::vector vsplit(const at::Tensor & self, int64_t sections) { + return at::_ops::vsplit_int::call(self, sections); +} + +// aten::vsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[] +inline ::std::vector vsplit(const at::Tensor & self, at::IntArrayRef indices) { + return at::_ops::vsplit_array::call(self, indices); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/vsplit_ops.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/vsplit_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..81f818d759255e5d82f720eea582dda91058f1cc --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/vsplit_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API vsplit_int { + using schema = ::std::vector (const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::vsplit"; + static constexpr const char* overload_name = "int"; + static constexpr const char* schema_str = "vsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]"; + static ::std::vector call(const at::Tensor & self, int64_t sections); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sections); +}; + +struct TORCH_API vsplit_array { + using schema = ::std::vector (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::vsplit"; + static constexpr const char* overload_name = "array"; + static constexpr const char* schema_str = "vsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]"; + static ::std::vector call(const at::Tensor & self, at::IntArrayRef indices); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef indices); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/xlogy_meta_dispatch.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/xlogy_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..735be7dfd0441a52e2c5a78b67c241748666baca --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/xlogy_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor xlogy(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & xlogy_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & xlogy_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & xlogy_(at::Tensor & self, const at::Tensor & other); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/zero_native.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/zero_native.h new file mode 100644 index 0000000000000000000000000000000000000000..388a6963d0ed77e0632b546e17fba69057ec4491 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ops/zero_native.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor zero(const at::Tensor & self); +TORCH_API at::Tensor & zero_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & zero_(at::Tensor & self); +TORCH_API at::Tensor & zero_meta_(at::Tensor & self); +TORCH_API at::Tensor & zero_nested_(at::Tensor & self); +TORCH_API at::Tensor & zero_sparse_(at::Tensor & self); +TORCH_API at::Tensor & zero_sparse_csr_(at::Tensor & self); +TORCH_API at::Tensor & mkldnn_zero_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/CachingHostAllocator.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/CachingHostAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..c153824e0607ef92b0828c1a670ad5d7644d2c9c --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/CachingHostAllocator.h @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace at::xpu { + +C10_DEPRECATED_MESSAGE( + "at::xpu::getCachingHostAllocator() is deprecated. Please use at::getHostAllocator(at::kXPU) instead.") +inline TORCH_XPU_API at::HostAllocator* getCachingHostAllocator() { + return at::getHostAllocator(at::kXPU); +} + +C10_DEPRECATED_MESSAGE( + "at::xpu::CachingHostAllocator_recordEvent(...) is deprecated. Please use at::getHostAllocator(at::kXPU)->record_event(...) instead.") +inline TORCH_XPU_API bool CachingHostAllocator_recordEvent( + void* ptr, + void* ctx, + c10::xpu::XPUStream stream) { + return getHostAllocator(at::kXPU)->record_event(ptr, ctx, stream.unwrap()); +} + +C10_DEPRECATED_MESSAGE( + "at::xpu::CachingHostAllocator_emptyCache() is deprecated. Please use at::getHostAllocator(at::kXPU)->empty_cache() instead.") +inline TORCH_XPU_API void CachingHostAllocator_emptyCache() { + getHostAllocator(at::kXPU)->empty_cache(); +} + +C10_DEPRECATED_MESSAGE( + "at::xpu::HostAlloc(...) is deprecated. Please use at::getHostAllocator(at::kXPU)->allocate(...) instead.") +inline TORCH_XPU_API at::DataPtr HostAlloc(size_t size) { + return getHostAllocator(at::kXPU)->allocate(size); +} + +} // namespace at::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/PeerToPeerAccess.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/PeerToPeerAccess.h new file mode 100644 index 0000000000000000000000000000000000000000..a807cd1ffc18d6f2d4248e9a525457b71fcb70ee --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/PeerToPeerAccess.h @@ -0,0 +1,20 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::xpu { +namespace detail { +void init_p2p_access_cache(c10::DeviceIndex num_devices); +} // namespace detail + +TORCH_XPU_API bool get_p2p_access( + c10::DeviceIndex dev, + c10::DeviceIndex dev_to_access); + +} // namespace at::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/PhiloxXpuState.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/PhiloxXpuState.h new file mode 100644 index 0000000000000000000000000000000000000000..f3f602fe3dd5a6c3ebb9d20f8e51a4f246679977 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/PhiloxXpuState.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at { + +struct PhiloxXpuState { + PhiloxXpuState() = default; + PhiloxXpuState(uint64_t seed, uint64_t offset) { + seed_.val = seed; + offset_.val = offset; + } + // for graph capture + PhiloxXpuState( + int64_t* seed, + int64_t* offset_extragraph, + uint32_t offset_intragraph) { + seed_.ptr = seed; + offset_.ptr = offset_extragraph; + offset_intragraph_ = offset_intragraph; + captured_ = true; + } + + union Payload { + uint64_t val; + int64_t* ptr; + }; + + Payload seed_{}; + Payload offset_{}; + uint32_t offset_intragraph_ = 0; + bool captured_ = false; +}; + +namespace xpu::philox { +inline std::tuple unpack(at::PhiloxXpuState arg) { + if (arg.captured_) { + return std::make_tuple( + static_cast(*arg.seed_.ptr), + static_cast(*(arg.offset_.ptr) + arg.offset_intragraph_)); + } else { + return std::make_tuple(arg.seed_.val, arg.offset_.val); + } +} + +} // namespace xpu::philox +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/PinnedMemoryAllocator.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/PinnedMemoryAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..0ef4066089c40bbc28001edf111209932b995864 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/PinnedMemoryAllocator.h @@ -0,0 +1,16 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::xpu { + +inline TORCH_XPU_API at::HostAllocator* getPinnedMemoryAllocator() { + return at::getHostAllocator(at::kXPU); +} +} // namespace at::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUContext.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUContext.h new file mode 100644 index 0000000000000000000000000000000000000000..049b4f68267552a98ce8e07210dd7d3aa8e80e91 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUContext.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::xpu { + +// XPU is available if we compiled with XPU. +inline bool is_available() { + return c10::xpu::device_count() > 0; +} + +TORCH_XPU_API DeviceProp* getCurrentDeviceProperties(); + +TORCH_XPU_API DeviceProp* getDeviceProperties(DeviceIndex device); + +TORCH_XPU_API int32_t getGlobalIdxFromDevice(DeviceIndex device); + +TORCH_XPU_API bool canDeviceAccessPeer(DeviceIndex device, DeviceIndex peer); + +} // namespace at::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUDevice.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUDevice.h new file mode 100644 index 0000000000000000000000000000000000000000..63b56c86c6ed26d2877eb4534dd831007830dbb4 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUDevice.h @@ -0,0 +1,18 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::xpu { + +inline Device getDeviceFromPtr(void* ptr) { + auto device = c10::xpu::get_device_idx_from_pointer(ptr); + return {c10::DeviceType::XPU, device}; +} + +} // namespace at::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUEvent.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUEvent.h new file mode 100644 index 0000000000000000000000000000000000000000..be5c5b83169f0a632d913e08b161ab19bafb6421 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUEvent.h @@ -0,0 +1,8 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUGeneratorImpl.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUGeneratorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..15567d178f5848214055d9e9df6411ced16a3a5e --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUGeneratorImpl.h @@ -0,0 +1,78 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at { + +namespace xpu { +struct XPUGraph; +} + +struct XPUGeneratorState : public c10::intrusive_ptr_target { + uint64_t seed_; + uint64_t philox_offset_per_thread_; + uint32_t offset_intragraph_; + bool capturing_{}; + at::TensorBase seed_extragraph_{}; + at::TensorBase offset_extragraph_{}; + + XPUGeneratorState( + uint64_t seed = default_rng_seed_val, + uint64_t philox_offset_per_thread = 0, + uint32_t offset_intragraph = 0) + : seed_(seed), + philox_offset_per_thread_(philox_offset_per_thread), + offset_intragraph_(offset_intragraph) {} + + void increase(uint64_t increment); + + c10::intrusive_ptr clone(); +}; + +struct TORCH_XPU_API XPUGeneratorImpl : public GeneratorImpl { + // Constructors + XPUGeneratorImpl(DeviceIndex device_index = -1); + XPUGeneratorImpl( + DeviceIndex device_index, + c10::intrusive_ptr state_); + ~XPUGeneratorImpl() override = default; + + // XPUGeneratorImpl methods + std::shared_ptr clone() const; + void set_current_seed(uint64_t seed) override; + void set_offset(uint64_t offset) override; + uint64_t get_offset() const override; + uint64_t current_seed() const override; + uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; + + void set_philox_offset_per_thread(uint64_t offset); + uint64_t philox_offset_per_thread() const; + + PhiloxXpuState philox_xpu_state(uint64_t increment); + // will remove once all ops are refactored to use philox_xpu_state. + std::pair philox_engine_inputs(uint64_t increment); + static c10::DeviceType device_type(); + + private: + XPUGeneratorImpl* clone_impl() const override; + c10::intrusive_ptr state_; +}; + +namespace xpu::detail { + +TORCH_XPU_API const Generator& getDefaultXPUGenerator(DeviceIndex device = -1); + +TORCH_XPU_API Generator createXPUGenerator(DeviceIndex device = -1); + +} // namespace xpu::detail +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUGraphsUtils.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUGraphsUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..8b61e894d54d97ee140049b356477a82d38fd6b7 --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUGraphsUtils.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::xpu { + +inline CaptureStatus currentStreamCaptureStatus() { + return c10::xpu::currentStreamCaptureStatusMayInitCtx(); +} + +inline void assertNotCapturing(const std::string& attempt) { + auto status = currentStreamCaptureStatus(); + TORCH_CHECK( + status == CaptureStatus::Executing, + attempt, + " during XPU graph capture. If you need this call to be captured, " + "please file an issue. " + "Current xpuStreamCaptureStatus: ", + status); +} + +} // namespace at::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUScaledBlas.h b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUScaledBlas.h new file mode 100644 index 0000000000000000000000000000000000000000..883e7642d968186b4a006c74e9ac558c2d3557ce --- /dev/null +++ b/URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/xpu/XPUScaledBlas.h @@ -0,0 +1,100 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include +#include +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef USE_FBGEMM_GENAI +#include +#endif + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +using at::blas::ScalingType; + +namespace at::native::onednn::scaled { + +/** + * Track concrete implementations available + */ +enum class ScaledGemmImplementation { + NONE = 0, + TENSORWISE_TENSORWISE = 1, + ROWWISE_ROWWISE = 2, +}; + +/** + * Convert passed int (enum) from python back into a + * strictly-typed enum + */ +template +std::vector convert_int_to_enum(ArrayType& v) { + std::vector converted; + converted.reserve(v.size()); + + for (auto vi : v) { + converted.push_back(static_cast(vi)); + } + return converted; +} + +bool check_tensorwise_recipe( + c10::ScalarType, + std::vector&, + ArrayRef&, + c10::ScalarType, + std::vector&, + ArrayRef&); + +bool check_rowwise_recipe( + c10::ScalarType, + std::vector&, + ArrayRef&, + c10::ScalarType, + std::vector&, + ArrayRef&); + +} // namespace at::native::onednn::scaled + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)