BryanW committed on Mar 23

Commit

3f10421

verified ·

1 Parent(s): fd876a9

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h +218 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h +111 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h +346 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h +395 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h +32 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h +43 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h +46 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h +415 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +790 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h +145 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h +72 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h +285 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h +955 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h +22 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h +342 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h +35 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h +41 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h +86 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h +162 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h +186 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h +599 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/FlushDenormal.h +19 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/Utils.h +38 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/vml.h +175 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/BLASConstants.h +16 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h +76 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h +156 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh +41 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh +129 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h +42 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h +16 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh +141 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh +48 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh +121 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/UnpackRaw.cuh +39 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h +705 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h +692 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h +282 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h +55 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h +270 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h +334 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h +434 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/ADInterpreters.h +43 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchRulesHelper.h +486 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedFallback.h +86 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h +181 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchingMetaprogramming.h +131 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/DynamicLayer.h +129 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h +27 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/Interpreter.h +358 -0

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h ADDED Viewed

	@@ -0,0 +1,218 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/boxing/OperatorKernel.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/util/intrusive_ptr.h>
+namespace c10 {
+struct IValue;
+using Stack = std::vector<IValue>;
+class OperatorHandle;
+class KernelFunction;
+// This kernel implements the behavior of falling through to the next available
+// registered dispatch key.  The implementation of this function is FAST; there
+// is no overhead to fall through to the next key.  See cpp file for some more
+// implementation notes; notably, this does NOT actually go through the
+// boxing/unboxing codepath.
+TORCH_API void fallthrough_kernel(
+    OperatorKernel* /*unused*/,
+    const OperatorHandle& /*unused*/,
+    DispatchKeySet /*unused*/,
+    Stack* /*unused*/);
+// Note [Ambiguity in AutogradOther kernel]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This error-reporting kernel is registered to the AutogradOther entry in the
+// dispatch table when there is both a CompositeImplicitAutograd kernel and a
+// backend kernel for ANY backend that maps to AutogradOther.  To see why
+// this is necessary in the AutogradOther case, it's helpful to first see
+// why everything works out fine for a backend that has a reserved Autograd
+// entry (see rule 2.2 in [Note] DispatchTable computation):
+//
+//    CPU   AutogradCPU
+//    reg?  registers with...
+//    -------------------------------------------------
+//    y     Autograd registration takes precedence
+//          over CompositeImplicitAutograd.
+//          This is good, because the CPU specific backend
+//          implementation is more specialized and typically better;
+//          if we used the composite, we would bypass it.
+//          (NB: the Autograd key is guaranteed to exist because
+//          the autograd codegen requires it!)
+//
+//    n     CompositeImplicitAutograd takes precedence.
+//          This is also good, because the Autograd
+//          registration (if it exists) would try to redispatch
+//          to the (non-existent) CPU implementation; by
+//          using the composite, we ensure the operator
+//          actually works.
+//
+// As you can see, when we have a specific Autograd key (AutogradCPU), we can
+// decide whether or not to use the CompositeImplicitAutograd kernel or the
+// Autograd kernel based on whether or not the backend kernel exists.
+//
+// However, for AutogradOther (which is the catchall autograd kernel for
+// everything that doesn't have a specific Autograd key), we can't do this
+// trick because there isn't any unique backend to peek at to disambiguate;
+// backends that have implementations would prefer Autograd, but backends
+// without implementations would prefer CompositeImplicitAutograd.  Rather
+// than arbitrarily pick one or the other, we just register a kernel that raises
+// an error and let the user decide how to proceed.
+TORCH_API void ambiguous_autogradother_kernel(
+    OperatorKernel* /*unused*/,
+    const OperatorHandle& /*op*/,
+    DispatchKeySet /*unused*/,
+    Stack* /*unused*/);
+// Note [named_not_supported_kernel]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This kernel reports an error saying that named tensors are not supported; it
+// is declared [[noreturn]].  It doesn't rely on the Stack, and so it is special
+// cased in the dispatcher to be triggered before we attempt boxing (so we can
+// give a good error message in cases when boxing is not supported).  When
+// boxing is universally supported this can be removed.
+[[noreturn]] TORCH_API void named_not_supported_kernel(
+    OperatorKernel* /*unused*/,
+    const OperatorHandle& /*op*/,
+    DispatchKeySet /*unused*/,
+    Stack* /*unused*/);
+/**
+ * BoxedKernel is similar to a std::function storing a boxed kernel: it pairs
+ * an optional OperatorKernel functor with a function pointer of type
+ * InternalBoxedKernelFunction.
+ */
+class TORCH_API BoxedKernel final {
+ public:
+  // This is how boxed kernels are actually stored
+  //
+  // Note [Plumbing Keys Through The Dispatcher]
+  // Benchmarks have shown that it is expensive for the dispatcher to read from
+  // thread-local storage (TLS) upon every dispatch call in order to compute
+  // which kernel to dispatch to.
+  //
+  // To mitigate this, we've updated the calling convention inside the
+  // dispatcher to expect every kernel that it stores to have a first argument
+  // of type DispatchKeySet.
+  //
+  // What are the invariants of the DispatchKeySet when it gets passed to a
+  // kernel?
+  // - All keys to the left of the current dispatch key have been masked out.
+  //   (e.g. a Tracing kernel that takes in the DispatchKeySet will expect the
+  //   highest bit to be DispatchKey::Tracer)
+  // - All other keys that the dispatcher normally would have computed through
+  //   TLS + global state + op arguments
+  //   are still in the set.
+  //
+  // Kernels can then opt into using this keyset to save the dispatcher from
+  // doing repeated work during redispatches: recalculating the highest-priority
+  // dispatch key, which involves reading from TLS. Instead, the kernels that
+  // opt in will calculate an updated DispatchKeySet directly from the old one,
+  // and pass the updated set directly into the dispatcher upon redispatching.
+  //
+  // This is an opt-in mechanism: Kernels can automatically opt in by setting
+  // the first argument in their signature to be of type DispatchKeySet. See the
+  // kernels in VariableTypeEverything.cpp and TraceTypeEverything.cpp for
+  // examples.
+  //
+  // The mechanism for optionally passing that DispatchKeySet into the kernel
+  // lives in make_boxed_from_unboxed_functor.h. See Note [Plumbing Keys Through
+  // The Dispatcher 2] for details.
+  using InternalBoxedKernelFunction =
+      void(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
+  // This is the public API for how boxed kernels are defined
+  using BoxedKernelFunction = void(const OperatorHandle&, Stack*);
+  using BoxedKernelFunction_withDispatchKeys =
+      void(const OperatorHandle&, DispatchKeySet, Stack*);
+  BoxedKernel();
+  // Fast path for dispatch to allow not touching the boxed kernel in
+  // the common case where unboxed is available.
+  bool isValid() const;
+  bool isFallthrough() const;
+  /**
+   * Call the function with boxed arguments.
+   *
+   * The operator handle, the dispatch key set and the stack are passed
+   * through to the stored boxed function.
+   */
+  void callBoxed(
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) const;
+  /**
+   * Create a BoxedKernel from a boxed function.
+   *
+   * Example:
+   *
+   * > void boxed_func(const OperatorHandle&, Stack* stack) {...}
+   * > BoxedKernel func = BoxedKernel::makeFromFunction<&boxed_func>();
+   */
+  template <BoxedKernelFunction* func>
+  static BoxedKernel makeFromFunction();
+  /**
+   * TODO: This will only be useful if we write a backend fallback that plumbs
+   * dispatch keys (currently there are none) See Note [Plumbing Keys Through
+   * The Dispatcher] for details.
+   */
+  template <BoxedKernelFunction_withDispatchKeys* func>
+  static BoxedKernel makeFromFunction();
+  /**
+   * Create a BoxedKernel from a boxed functor.
+   *
+   * Example:
+   *
+   * > class MyFunctor final : public c10::OperatorKernel {
+   * >   public:
+   * >     void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...}
+   * > };
+   * > BoxedKernel func =
+   * >     BoxedKernel::makeFromFunctor(std::make_unique<MyFunctor>());
+   */
+  template <class KernelFunctor>
+  static BoxedKernel makeFromFunctor(
+      std::unique_ptr<KernelFunctor> kernelFunctor);
+  static BoxedKernel makeFallthrough();
+  static BoxedKernel makeAmbiguousAutogradOther();
+  static BoxedKernel makeNamedNotSupported();
+ private:
+  friend class KernelFunction;
+  template <BoxedKernelFunction* func>
+  static void make_boxed_function(
+      OperatorKernel* /*unused*/,
+      const OperatorHandle& opHandle,
+      DispatchKeySet /*unused*/,
+      Stack* stack);
+  template <BoxedKernelFunction_withDispatchKeys* func>
+  static void make_boxed_function(
+      OperatorKernel* /*unused*/,
+      const OperatorHandle& opHandle,
+      DispatchKeySet /*ks*/,
+      Stack* stack);
+  explicit BoxedKernel(
+      std::unique_ptr<OperatorKernel> functor,
+      InternalBoxedKernelFunction* boxed_kernel_func);
+  OperatorKernel* getFunctor() const;
+  InternalBoxedKernelFunction* getFnPtr() const;
+  c10::intrusive_ptr<OperatorKernel> functor_;
+  InternalBoxedKernelFunction* boxed_kernel_func_;
+};
+} // namespace c10
+#include <ATen/core/boxing/BoxedKernel_impl.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h ADDED Viewed

	@@ -0,0 +1,111 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+namespace c10 {
+// Default state: no functor and no boxed function pointer, so isValid() is
+// false until the object is replaced by one of the factory results.
+inline BoxedKernel::BoxedKernel() : functor_(), boxed_kernel_func_(nullptr) {}
+// Takes ownership of `functor` (which may be null) and records the boxed
+// entry point that callBoxed() will later invoke.
+inline BoxedKernel::BoxedKernel(
+    std::unique_ptr<OperatorKernel> functor,
+    InternalBoxedKernelFunction* boxed_kernel_func)
+    : functor_(std::move(functor)),
+      boxed_kernel_func_(boxed_kernel_func) {}
+// Adapter from the internal calling convention to a public boxed function
+// that does not take a DispatchKeySet.
+template <BoxedKernel::BoxedKernelFunction* func>
+inline void BoxedKernel::make_boxed_function(
+    OperatorKernel* /*functor*/,
+    const OperatorHandle& opHandle,
+    DispatchKeySet /*dispatchKeySet*/,
+    Stack* stack) {
+  // The wrapped function has no DispatchKeySet parameter, so the key set
+  // (like the functor pointer) is dropped here.
+  // See Note [Plumbing Keys Through The Dispatcher 2] for details.
+  (*func)(opHandle, stack);
+}
+// Adapter from the internal calling convention to a public boxed function
+// that does take a DispatchKeySet; only the functor pointer is dropped.
+template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
+inline void BoxedKernel::make_boxed_function(
+    OperatorKernel* /*functor*/,
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) {
+  // See Note [Plumbing Keys Through The Dispatcher 2] for details.
+  (*func)(opHandle, dispatchKeySet, stack);
+}
+// A BoxedKernel is valid exactly when a boxed function pointer has been set.
+inline bool BoxedKernel::isValid() const {
+  const bool hasBoxedFn = (nullptr != boxed_kernel_func_);
+  return hasBoxedFn;
+}
+// True when the stored function pointer is the address of fallthrough_kernel.
+inline bool BoxedKernel::isFallthrough() const {
+  InternalBoxedKernelFunction* const fallthroughFn = &fallthrough_kernel;
+  return fallthroughFn == boxed_kernel_func_;
+}
+// Invokes the stored boxed function with the stored functor (possibly null),
+// forwarding the operator handle, key set and stack unchanged.
+inline void BoxedKernel::callBoxed(
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) const {
+  // Debug-only guard: calling through a null function pointer is a bug.
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      boxed_kernel_func_ != nullptr,
+      "Tried to call BoxedKernel::callBoxed() on an uninitialized BoxedKernel.");
+  OperatorKernel* const rawFunctor = functor_.get();
+  boxed_kernel_func_(rawFunctor, opHandle, dispatchKeySet, stack);
+}
+// Wraps a compile-time boxed function (without DispatchKeySet parameter).
+template <BoxedKernel::BoxedKernelFunction* func>
+inline BoxedKernel BoxedKernel::makeFromFunction() {
+  InternalBoxedKernelFunction* const adapter = &make_boxed_function<func>;
+  // Plain functions carry no state, so there is no functor_ object.
+  return BoxedKernel(nullptr, adapter);
+}
+// Wraps a compile-time boxed function that also receives the DispatchKeySet.
+template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
+inline BoxedKernel BoxedKernel::makeFromFunction() {
+  InternalBoxedKernelFunction* const adapter = &make_boxed_function<func>;
+  // Plain functions carry no state, so there is no functor_ object.
+  return BoxedKernel(nullptr, adapter);
+}
+// Builds the BoxedKernel whose function pointer is fallthrough_kernel; this is
+// the value isFallthrough() compares against.
+inline BoxedKernel BoxedKernel::makeFallthrough() {
+  InternalBoxedKernelFunction* const kernelFn = &fallthrough_kernel;
+  return BoxedKernel(nullptr /* no functor_ object */, kernelFn);
+}
+// Builds the BoxedKernel whose function pointer is
+// ambiguous_autogradother_kernel.
+inline BoxedKernel BoxedKernel::makeAmbiguousAutogradOther() {
+  InternalBoxedKernelFunction* const kernelFn = &ambiguous_autogradother_kernel;
+  return BoxedKernel(nullptr /* no functor_ object */, kernelFn);
+}
+// Builds the BoxedKernel whose function pointer is named_not_supported_kernel.
+inline BoxedKernel BoxedKernel::makeNamedNotSupported() {
+  InternalBoxedKernelFunction* const kernelFn = &named_not_supported_kernel;
+  return BoxedKernel(nullptr /* no functor_ object */, kernelFn);
+}
+// Wraps a boxed functor: the functor is stored in the BoxedKernel and a
+// captureless trampoline casts the stored OperatorKernel* back to
+// KernelFunctor before invoking its call operator.
+template <class KernelFunctor>
+inline BoxedKernel BoxedKernel::makeFromFunctor(
+    std::unique_ptr<KernelFunctor> kernelFunctor) {
+  static_assert(
+      std::is_base_of_v<OperatorKernel, KernelFunctor>,
+      "Tried to call BoxedKernel::makeFromFunctor<KernelFunctor>, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+  InternalBoxedKernelFunction* const trampoline =
+      [](OperatorKernel* erasedFunctor,
+         const OperatorHandle& opHandle,
+         DispatchKeySet dispatchKeySet,
+         Stack* stack) {
+        KernelFunctor* const typedFunctor =
+            static_cast<KernelFunctor*>(erasedFunctor);
+        (*typedFunctor)(opHandle, dispatchKeySet, stack);
+      };
+  return BoxedKernel(std::move(kernelFunctor), trampoline);
+}
+// Non-owning access to the stored functor; null when none was supplied.
+inline OperatorKernel* BoxedKernel::getFunctor() const {
+  OperatorKernel* const rawFunctor = functor_.get();
+  return rawFunctor;
+}
+// Returns the stored boxed function pointer (null for a default-constructed
+// BoxedKernel).
+inline BoxedKernel::InternalBoxedKernelFunction* BoxedKernel::getFnPtr() const {
+  InternalBoxedKernelFunction* const fn = boxed_kernel_func_;
+  return fn;
+}
+} // namespace c10
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction.h ADDED Viewed

	@@ -0,0 +1,346 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/ATen_fwd.h>
+#include <ATen/core/boxing/BoxedKernel.h>
+#include <ATen/core/stack.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/util/TypeList.h>
+#include <c10/util/intrusive_ptr.h>
+#include <atomic>
+#include <memory>
+#include <type_traits>
+namespace c10 {
+using Stack = torch::jit::Stack; // TODO Instead of this, move torch::jit::Stack
+                                 // to the c10 namespace.
+class OperatorHandle;
+struct OperatorKernel;
+class KernelFunction;
+class KernelToken;
+class SafeKernelFunction;
+// Type predicate: ::value is true when Candidate is exactly one of the four
+// SymInt-carrying argument types listed below.
+template <typename Candidate>
+using has_symint = std::disjunction<
+    std::is_same<c10::SymInt, Candidate>,
+    std::is_same<c10::SymIntArrayRef, Candidate>,
+    std::is_same<at::OptionalSymIntArrayRef, Candidate>,
+    std::is_same<std::optional<c10::SymInt>, Candidate>>;
+// Maps each SymInt-carrying argument type to its concrete-integer counterpart.
+// Primary template: any other type maps to itself.
+template <typename T>
+struct remove_symint {
+  using type = T;
+};
+// c10::SymInt -> int64_t
+template <>
+struct remove_symint<c10::SymInt> {
+  using type = int64_t;
+};
+// std::optional<c10::SymInt> -> std::optional<int64_t>
+template <>
+struct remove_symint<std::optional<c10::SymInt>> {
+  using type = std::optional<int64_t>;
+};
+// c10::SymIntArrayRef -> c10::IntArrayRef
+template <>
+struct remove_symint<c10::SymIntArrayRef> {
+  using type = c10::IntArrayRef;
+};
+// at::OptionalSymIntArrayRef -> OptionalIntArrayRef
+template <>
+struct remove_symint<at::OptionalSymIntArrayRef> {
+  using type = OptionalIntArrayRef;
+};
+// Selects between T itself (KeepSymInt == true) and remove_symint<T>::type
+// (KeepSymInt == false). The primary template intentionally has no ::type.
+template <bool KeepSymInt, typename T>
+struct maybe_keep_symint final {};
+// false: strip the SymInt flavour from T.
+template <typename T>
+struct maybe_keep_symint<false, T> {
+  using type = typename remove_symint<T>::type;
+};
+// true: leave T untouched.
+template <typename T>
+struct maybe_keep_symint<true, T> {
+  using type = T;
+};
+// Applies has_symint to every parameter type of the function type FuncType
+// (via guts::infer_function_traits) and reports whether any of them matches.
+template <typename FuncType>
+using fn_has_symint = typename guts::typelist::true_for_any_type<
+    has_symint,
+    typename guts::infer_function_traits<FuncType>::type::parameter_types>;
+// Rewrites a function type by applying remove_symint to each parameter type;
+// the return type is left as is. Only defined for plain function types.
+template <typename FuncType>
+struct fn_remove_symint;
+template <typename Return, typename... Params>
+struct fn_remove_symint<Return(Params...)> {
+  using type = Return(typename remove_symint<Params>::type...);
+};
+/**
+ * KernelFunction is similar to std::function but stores a kernel function.
+ * You can create a KernelFunction from a boxed or unboxed
+ * function/functor/lambda and call it in a boxed or unboxed way. If the way it
+ * was created doesn't match the way it was called, it will do boxing or
+ * unboxing as necessary.
+ *
+ * Copy construction and copy assignment copy the stored kernel pointers but
+ * intentionally do not copy the tokens registered via registerToken().
+ */
+class TORCH_API KernelFunction final {
+ public:
+  using InternalBoxedKernelFunction = BoxedKernel::InternalBoxedKernelFunction;
+  using BoxedKernelFunction = BoxedKernel::BoxedKernelFunction;
+  using BoxedKernelFunction_withDispatchKeys =
+      BoxedKernel::BoxedKernelFunction_withDispatchKeys;
+  KernelFunction();
+  ~KernelFunction();
+  KernelFunction(const KernelFunction& other);
+  KernelFunction& operator=(const KernelFunction& other);
+  KernelFunction(KernelFunction&&) noexcept = default;
+  // Fast path for dispatch to allow not touching the boxed kernel in
+  // the common case where unboxed is available.
+  bool isValidUnboxed() const;
+  bool isValidSymUnboxed() const;
+  bool isValid() const;
+  bool isFallthrough() const;
+  /**
+   * Call the function in a boxed way.
+   * If the kernel function was created with an unboxed function,
+   * this will call an unboxing wrapper which then calls into that
+   * unboxed function.
+   *
+   * callBoxed() returns void; `stack` below is a Stack*.
+   *
+   * Example:
+   *
+   * > void boxed_func(const OperatorHandle&, Stack* stack) {...}
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromBoxedFunction<&boxed_func>();
+   * > func.callBoxed(opHandle, dispatchKeySet, stack);
+   *
+   * Or, with an unboxed implementation:
+   *
+   * > KernelFunction func = KernelFunction::makeFromUnboxedLambda(
+   * >      [] (Tensor a, bool b) -> Tensor {...});
+   * > func.callBoxed(opHandle, dispatchKeySet, stack);
+   */
+  void callBoxed(
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) const;
+  /**
+   * Call the function in an unboxed way.
+   * If the kernel function was created with a boxed function,
+   * this will box all inputs and then call into that boxed function.
+   *
+   * Note that this doesn't work for all types yet.
+   *
+   * Example:
+   *
+   * > KernelFunction func = KernelFunction::makeFromUnboxedLambda(
+   * >      [] (Tensor a, bool b) -> Tensor {...});
+   * > Tensor result = func.call<Tensor, Tensor, bool>(
+   * >      opHandle, dispatchKeySet, tensor1, true);
+   *
+   * Or, with a boxed implementation:
+   *
+   * > void boxed_func(const OperatorHandle&, Stack* stack) {...}
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromBoxedFunction<&boxed_func>();
+   * > Tensor result = func.call<Tensor, Tensor, bool>(
+   * >      opHandle, dispatchKeySet, tensor1, true);
+   */
+  template <class Return, class... Args>
+  Return call(
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Args... args) const;
+  /**
+   * Create a KernelFunction from a BoxedKernel.
+   */
+  static KernelFunction makeFromBoxedKernel(BoxedKernel boxed_fn);
+  /**
+   * Create a KernelFunction from a boxed function.
+   *
+   * Example:
+   *
+   * > void boxed_func(const OperatorHandle&, Stack* stack) {...}
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromBoxedFunction<&boxed_func>();
+   */
+  template <BoxedKernelFunction* func>
+  static KernelFunction makeFromBoxedFunction();
+  /**
+   * TODO: This will only be useful if we write a backend fallback that plumbs
+   * dispatch keys (currently there are none) See Note [Plumbing Keys Through
+   * The Dispatcher] for details.
+   */
+  template <BoxedKernelFunction_withDispatchKeys* func>
+  static KernelFunction makeFromBoxedFunction();
+  /**
+   * Create a KernelFunction from an unboxed functor.
+   *
+   * KernelFunctor cannot be deduced from the std::unique_ptr<OperatorKernel>
+   * parameter, so both template arguments have to be spelled out.
+   *
+   * Example:
+   *
+   * > class MyFunctor final : public c10::OperatorKernel {
+   * >   public:
+   * >     Tensor operator()(Tensor a, Tensor b) {...}
+   * > };
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromUnboxedFunctor<false, MyFunctor>(
+   * >         std::make_unique<MyFunctor>());
+   */
+  template <bool AllowLegacyTypes = false, class KernelFunctor>
+  static KernelFunction makeFromUnboxedFunctor(
+      std::unique_ptr<OperatorKernel> kernelFunctor);
+  /**
+   * Create a KernelFunction from a boxed functor.
+   *
+   * Example:
+   *
+   * > class MyFunctor final : public c10::OperatorKernel {
+   * >   public:
+   * >     void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...}
+   * > };
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromBoxedFunctor(std::make_unique<MyFunctor>());
+   */
+  template <class KernelFunctor>
+  static KernelFunction makeFromBoxedFunctor(
+      std::unique_ptr<KernelFunctor> kernelFunctor);
+  /**
+   * Create a KernelFunction from an unboxed function.
+   * This is usually better than KernelFunction::makeFromUnboxedRuntimeFunction
+   * because knowing the function pointer as a template argument (i.e. at
+   * compile time) allows the compiler to inline the function into its
+   * unboxing wrapper and yields better performance when calling the function.
+   *
+   * Example:
+   *
+   * > Tensor unboxed_func(Tensor a, Tensor b) {...}
+   * > KernelFunction func =
+   * KernelFunction::makeFromUnboxedFunction<decltype(unboxed_func),
+   * &unboxed_func>();
+   *
+   * NOTE(review): the declaration below takes a single value argument of type
+   * FuncPtr, which the example above (two template arguments, no call
+   * argument) does not match; the example looks stale -- confirm the intended
+   * call form against the implementation.
+   */
+  template <class FuncPtr, bool AllowLegacyTypes = false>
+  static KernelFunction makeFromUnboxedFunction(FuncPtr /*func_ptr*/);
+  /**
+   * Create a KernelFunction from an unboxed function.
+   * KernelFunction::makeFromUnboxedFunction is usually a better choice than
+   * this if you know the function pointer at compile time, see doc comment
+   * there for an explanation.
+   *
+   * Example:
+   *
+   * > Tensor unboxed_func(Tensor a, Tensor b) {...}
+   * > KernelFunction func =
+   * >     KernelFunction::makeFromUnboxedRuntimeFunction(&unboxed_func);
+   */
+  template <bool AllowLegacyTypes = false, class FuncType>
+  static KernelFunction makeFromUnboxedRuntimeFunction(FuncType* func);
+  static KernelFunction makeFallthrough();
+  static KernelFunction makeAmbiguousAutogradOther();
+  static KernelFunction makeNamedNotSupported();
+  /**
+   * Create a KernelFunction from an unboxed lambda.
+   *
+   * Two overloads exist, selected by whether the lambda type satisfies
+   * guts::is_stateless_lambda.
+   *
+   * Example:
+   *
+   * > KernelFunction func = KernelFunction::makeFromUnboxedLambda(
+   * >      [] (Tensor a, bool b) -> Tensor {...});
+   */
+  template <bool AllowLegacyTypes = false, class Lambda>
+  static std::enable_if_t<
+      guts::is_stateless_lambda<std::decay_t<Lambda>>::value,
+      KernelFunction>
+  makeFromUnboxedLambda(Lambda&& lambda);
+  template <bool AllowLegacyTypes = false, class Lambda>
+  static std::enable_if_t<
+      !guts::is_stateless_lambda<std::decay_t<Lambda>>::value,
+      KernelFunction>
+  makeFromUnboxedLambda(Lambda&& lambda);
+  std::string dumpState() const;
+  // For testing internal invariants only
+  bool _equalsBoxedAndUnboxed(const KernelFunction& /*other*/) const;
+  // Register a token to be invalidated when this KernelFunction is destroyed
+  void registerToken(std::weak_ptr<KernelToken> token) const;
+ private:
+  explicit KernelFunction(
+      std::unique_ptr<OperatorKernel> functor,
+      InternalBoxedKernelFunction* boxed_kernel_func,
+      void* unboxed_kernel_func,
+      void* sym_unboxed_kernel_func);
+  explicit KernelFunction(
+      BoxedKernel boxed_fn,
+      void* unboxed_kernel_func,
+      void* sym_unboxed_kernel_func);
+  BoxedKernel boxed_kernel_func_;
+  void* unboxed_kernel_func_;
+  void* sym_unboxed_kernel_func_;
+  // List of tokens that need to be invalidated when this KernelFunction is
+  // destroyed (lazy allocation to save memory when empty)
+  mutable std::unique_ptr<std::vector<std::weak_ptr<KernelToken>>> tokens_;
+};
+// Token held by SafeKernelFunction; invalidated when the KernelFunction it was
+// registered with is destroyed. Backed by an atomic flag that starts as false.
+class KernelToken {
+ public:
+  bool isValid() const;
+  void invalidate();
+ private:
+  std::atomic<bool> invalid_{false};
+};
+class SafeKernelFunction {
+ public:
+  SafeKernelFunction(
+      const KernelFunction* kernel,
+      std::string debug,
+      std::shared_ptr<OperatorHandle> opHandle);
+  // Safe callBoxed - checks token validity first (definition not in this file)
+  void callBoxed(
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) const;
+  // Returns a reference to the stored debug string (debug_)
+  const std::string& debug() const {
+    return debug_;
+  }
+  // Returns the OperatorHandle held by opHandle_ (dereferences the shared_ptr)
+  const OperatorHandle& opHandle() const {
+    return *opHandle_;
+  }
+ private:
+  KernelFunction kernel_;
+  std::shared_ptr<KernelToken> token_;
+  std::string debug_;
+  std::shared_ptr<OperatorHandle> opHandle_;
+};
+} // namespace c10
+#include <ATen/core/boxing/KernelFunction_impl.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/KernelFunction_impl.h ADDED Viewed

	@@ -0,0 +1,395 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <ATen/core/boxing/impl/WrapFunctionIntoFunctor.h>
+#include <ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h>
+#include <ATen/core/boxing/impl/boxing.h>
+#include <ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h>
+#include <c10/util/C++17.h>
+#include <type_traits>
+namespace c10 {
+namespace detail {
+// Like std::make_unique<Child>(args...), but the result is typed as
+// std::unique_ptr<Base>. Only participates in overload resolution when
+// neither type is an array and Child derives from Base.
+template <typename Base, typename Child, typename... Args>
+std::enable_if_t<
+    !std::is_array_v<Base> && !std::is_array_v<Child> &&
+        std::is_base_of_v<Base, Child>,
+    std::unique_ptr<Base>>
+make_unique_base(Args&&... args) {
+  std::unique_ptr<Base> owned =
+      std::make_unique<Child>(std::forward<Args>(args)...);
+  return owned;
+}
+} // namespace detail
+// Default state: an invalid boxed kernel and no unboxed entry points.
+inline KernelFunction::KernelFunction()
+    : boxed_kernel_func_(),
+      unboxed_kernel_func_(nullptr),
+      sym_unboxed_kernel_func_(nullptr) {}
+// On destruction, invalidate every registered token that is still alive.
+inline KernelFunction::~KernelFunction() {
+  // tokens_ is allocated lazily; nothing to do if no token was registered.
+  if (!tokens_) {
+    return;
+  }
+  for (const std::weak_ptr<KernelToken>& observer : *tokens_) {
+    const std::shared_ptr<KernelToken> live = observer.lock();
+    if (live) {
+      live->invalidate();
+    }
+  }
+}
+// Copies the boxed kernel and both unboxed entry points from `other`.
+inline KernelFunction::KernelFunction(const KernelFunction& other)
+    : boxed_kernel_func_(other.boxed_kernel_func_),
+      unboxed_kernel_func_(other.unboxed_kernel_func_),
+      sym_unboxed_kernel_func_(other.sym_unboxed_kernel_func_) {
+  // Deliberately leave tokens_ empty: tokens only need to be invalidated when
+  // the original KernelFunction is destroyed, not when a copy is.
+}
+// Copy-assigns the boxed kernel and both unboxed entry points from `other`.
+inline KernelFunction& KernelFunction::operator=(const KernelFunction& other) {
+  // Self-assignment is a no-op.
+  if (this == &other) {
+    return *this;
+  }
+  boxed_kernel_func_ = other.boxed_kernel_func_;
+  unboxed_kernel_func_ = other.unboxed_kernel_func_;
+  sym_unboxed_kernel_func_ = other.sym_unboxed_kernel_func_;
+  // tokens_ is deliberately left alone: tokens only need to be invalidated
+  // when the original KernelFunction is destroyed.
+  return *this;
+}
+// Builds the boxed kernel from (functor, boxed_kernel_func) and then stores it
+// together with the unboxed entry points via the BoxedKernel-taking
+// constructor.
+inline KernelFunction::KernelFunction(
+    std::unique_ptr<OperatorKernel> functor,
+    InternalBoxedKernelFunction* boxed_kernel_func,
+    void* unboxed_kernel_func,
+    void* sym_unboxed_kernel_func = nullptr)
+    : KernelFunction(
+          BoxedKernel(std::move(functor), boxed_kernel_func),
+          unboxed_kernel_func,
+          sym_unboxed_kernel_func) {}
+// Stores an already-built BoxedKernel together with the unboxed entry points,
+// which are kept as opaque void* values.
+inline KernelFunction::KernelFunction(
+    BoxedKernel boxed_fn,
+    void* unboxed_kernel_func,
+    void* sym_unboxed_kernel_func = nullptr)
+    : boxed_kernel_func_(std::move(boxed_fn)),
+      unboxed_kernel_func_(unboxed_kernel_func),
+      sym_unboxed_kernel_func_(sym_unboxed_kernel_func) {}
+// True when an unboxed entry point has been stored.
+inline bool KernelFunction::isValidUnboxed() const {
+  const bool hasUnboxed = (nullptr != unboxed_kernel_func_);
+  return hasUnboxed;
+}
+// True when a symint-aware unboxed entry point has been stored.
+inline bool KernelFunction::isValidSymUnboxed() const {
+  const bool hasSymUnboxed = (nullptr != sym_unboxed_kernel_func_);
+  return hasSymUnboxed;
+}
+// Validity is defined by the boxed kernel alone.
+inline bool KernelFunction::isValid() const {
+  const BoxedKernel& boxed = boxed_kernel_func_;
+  return boxed.isValid();
+}
+// Delegates to the boxed kernel's fallthrough check.
+inline bool KernelFunction::isFallthrough() const {
+  const BoxedKernel& boxed = boxed_kernel_func_;
+  return boxed.isFallthrough();
+}
+// Boxed call: forwards all three arguments unchanged to the stored
+// BoxedKernel.
+inline void KernelFunction::callBoxed(
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) const {
+  const BoxedKernel& boxed = boxed_kernel_func_;
+  boxed.callBoxed(opHandle, dispatchKeySet, stack);
+}
+// Reinterprets the type-erased `unboxed_kernel_func` as a function taking
+// (OperatorKernel*, DispatchKeySet, Args...) and calls it with the given
+// functor, key set and arguments.
+template <class Return, class... Args>
+inline Return callUnboxedKernelFunction(
+    void* unboxed_kernel_func,
+    OperatorKernel* functor,
+    DispatchKeySet dispatchKeySet,
+    Args&&... args) {
+  using KernelSignature = Return(OperatorKernel*, DispatchKeySet, Args...);
+  auto* const typed_kernel =
+      reinterpret_cast<KernelSignature*>(unboxed_kernel_func);
+  return typed_kernel(functor, dispatchKeySet, std::forward<Args>(args)...);
+}
+// unpackSymInt<T>(x) must be called with T spelled out explicitly; letting the
+// compiler deduce T does not work.
+// NB: keep this in sync with cloneWithRealTypes in function_schema.cpp
+//
+// Generic case: the value is passed through as remove_symint<T>::type.
+template <typename T>
+inline typename remove_symint<T>::type unpackSymInt(T x) {
+  return x;
+}
+// SymInt: unwrapped via guard_int.
+template <>
+inline remove_symint<c10::SymInt>::type unpackSymInt(c10::SymInt x) {
+  return x.guard_int(__FILE__, __LINE__);
+}
+// SymIntArrayRef: converted with C10_AS_INTARRAYREF_SLOW.
+template <>
+inline remove_symint<c10::SymIntArrayRef>::type unpackSymInt(
+    c10::SymIntArrayRef x) {
+  return C10_AS_INTARRAYREF_SLOW(x);
+}
+// Optional SymInt: unwrapped via guard_int when present, nullopt otherwise.
+template <>
+inline remove_symint<std::optional<c10::SymInt>>::type unpackSymInt(
+    std::optional<c10::SymInt> x) {
+  auto unpacked = x.has_value()
+      ? std::make_optional(x->guard_int(__FILE__, __LINE__))
+      : std::nullopt;
+  return unpacked;
+}
+// Optional SymIntArrayRef: converted when present, nullopt otherwise.
+template <>
+inline remove_symint<at::OptionalSymIntArrayRef>::type unpackSymInt(
+    at::OptionalSymIntArrayRef x) {
+  auto unpacked = x.has_value()
+      ? std::make_optional(C10_AS_INTARRAYREF_SLOW(*x))
+      : std::nullopt;
+  return unpacked;
+}
+// Calls the kernel with unboxed arguments. Entry points are tried in this
+// order: the SymInt-aware unboxed function (only when the signature mentions
+// SymInt types), the plain unboxed function, and finally the boxed kernel via
+// impl::BoxedKernelWrapper.
+template <class Return, class... Args>
+C10_ALWAYS_INLINE Return KernelFunction::call(
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Args... args) const {
+  // note: Args is taken by value on purpose rather than as Args&&. Perfect
+  // forwarding would require Args to be deduced, whereas callers are expected
+  // to name the Args explicitly.
+  constexpr bool signature_has_symint =
+      std::disjunction_v<has_symint<Args>...>;
+  if constexpr (signature_has_symint) {
+    if (sym_unboxed_kernel_func_ != nullptr) {
+      auto* kernel_functor = boxed_kernel_func_.getFunctor();
+      return callUnboxedKernelFunction<Return, Args...>(
+          sym_unboxed_kernel_func_,
+          kernel_functor,
+          dispatchKeySet,
+          std::forward<Args>(args)...);
+    }
+    if (unboxed_kernel_func_ != nullptr) {
+      // Only a non-SymInt kernel is available: run every argument through
+      // unpackSymInt and call with the remove_symint'ed signature.
+      auto* kernel_functor = boxed_kernel_func_.getFunctor();
+      return callUnboxedKernelFunction<
+          Return,
+          typename remove_symint<Args>::type...>(
+          unboxed_kernel_func_,
+          kernel_functor,
+          dispatchKeySet,
+          unpackSymInt<Args>(args)...);
+    }
+  } else {
+    if (C10_LIKELY(unboxed_kernel_func_ != nullptr)) {
+      auto* kernel_functor = boxed_kernel_func_.getFunctor();
+      return callUnboxedKernelFunction<Return, Args...>(
+          unboxed_kernel_func_,
+          kernel_functor,
+          dispatchKeySet,
+          std::forward<Args>(args)...);
+    }
+  }
+  // No usable unboxed entry point: go through the boxed kernel.
+  return impl::BoxedKernelWrapper<Return(Args...)>::call(
+      boxed_kernel_func_,
+      opHandle,
+      dispatchKeySet,
+      std::forward<Args>(args)...);
+}
+// Appends `token` to this kernel's token list, creating the list on first use.
+inline void KernelFunction::registerToken(
+    std::weak_ptr<KernelToken> token) const {
+  if (tokens_ == nullptr) {
+    tokens_ = std::make_unique<std::vector<std::weak_ptr<KernelToken>>>();
+  }
+  tokens_->emplace_back(std::move(token));
+}
+// Wraps a BoxedKernel into a KernelFunction that has no unboxed entry point.
+inline KernelFunction KernelFunction::makeFromBoxedKernel(
+    BoxedKernel boxed_fn) {
+  void* const no_unboxed_fn = nullptr; // no unboxed function pointer
+  return KernelFunction(std::move(boxed_fn), no_unboxed_fn);
+}
+// Creates a KernelFunction from a compile-time boxed function pointer.
+template <KernelFunction::BoxedKernelFunction* func>
+inline KernelFunction KernelFunction::makeFromBoxedFunction() {
+  return makeFromBoxedKernel(BoxedKernel::makeFromFunction<func>());
+}
+// Same as above, for the boxed function pointer type named
+// BoxedKernelFunction_withDispatchKeys.
+template <KernelFunction::BoxedKernelFunction_withDispatchKeys* func>
+inline KernelFunction KernelFunction::makeFromBoxedFunction() {
+  return makeFromBoxedKernel(BoxedKernel::makeFromFunction<func>());
+}
+// KernelFunction wrapping BoxedKernel::makeFallthrough().
+inline KernelFunction KernelFunction::makeFallthrough() {
+  return makeFromBoxedKernel(BoxedKernel::makeFallthrough());
+}
+// KernelFunction wrapping BoxedKernel::makeAmbiguousAutogradOther().
+inline KernelFunction KernelFunction::makeAmbiguousAutogradOther() {
+  return makeFromBoxedKernel(BoxedKernel::makeAmbiguousAutogradOther());
+}
+// KernelFunction wrapping BoxedKernel::makeNamedNotSupported().
+inline KernelFunction KernelFunction::makeNamedNotSupported() {
+  return makeFromBoxedKernel(BoxedKernel::makeNamedNotSupported());
+}
+// Creates a KernelFunction from an unboxed functor. The unboxed wrapper is
+// stored in exactly one of the two unboxed slots: the SymInt slot when the
+// wrapper's signature has SymInt types, the plain slot otherwise.
+template <bool AllowLegacyTypes, class KernelFunctor>
+inline KernelFunction KernelFunction::makeFromUnboxedFunctor(
+    std::unique_ptr<OperatorKernel> kernelFunctor) {
+#ifndef NDEBUG
+  // Debug-only because this assertion is expensive at build time.
+  static_assert(
+      guts::is_functor<KernelFunctor>::value,
+      "Tried to call KernelFunction::makeFromUnboxedFunctor<KernelFunctor> but the argument is not a functor.");
+#endif
+  static_assert(
+      std::is_base_of_v<OperatorKernel, KernelFunctor>,
+      "Tried to call KernelFunction::makeFromUnboxedFunctor<KernelFunctor>, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+  auto* unboxed_fn = &impl::wrap_kernel_functor_unboxed<KernelFunctor>::call;
+  void* erased_unboxed_fn = reinterpret_cast<void*>(unboxed_fn);
+  const bool is_symint = fn_has_symint<decltype(unboxed_fn)>::value;
+  void* const plain_slot = is_symint ? nullptr : erased_unboxed_fn;
+  void* const sym_slot = is_symint ? erased_unboxed_fn : nullptr;
+  return KernelFunction(
+      std::move(kernelFunctor),
+      &impl::make_boxed_from_unboxed_functor<KernelFunctor, AllowLegacyTypes>::
+          call,
+      plain_slot,
+      sym_slot);
+}
+// Creates a KernelFunction (boxed only) from an owning boxed functor.
+template <class KernelFunctor>
+inline KernelFunction KernelFunction::makeFromBoxedFunctor(
+    std::unique_ptr<KernelFunctor> kernelFunctor) {
+  return makeFromBoxedKernel(
+      BoxedKernel::makeFromFunctor(std::move(kernelFunctor)));
+}
+// Creates a KernelFunction from a compile-time function pointer (TORCH_FN).
+// Off mobile the pointer is wrapped into a functor type; on mobile the
+// runtime-function path is used instead.
+template <class FuncPtr, bool AllowLegacyTypes>
+inline KernelFunction KernelFunction::makeFromUnboxedFunction(
+    FuncPtr func_ptr) {
+  static_assert(
+      is_compile_time_function_pointer<FuncPtr>::value,
+      "Tried to call KernelFunction::makeFromUnboxedFunction with an invalid parameter. It must be a function pointer created with TORCH_FN.");
+  static_assert(
+      !std::is_same_v<typename FuncPtr::FuncType, BoxedKernelFunction>,
+      "Tried to call KernelFunction::makeFromUnboxedFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead.");
+#if defined(__GNUC__) && defined(__SANITIZE_ADDRESS__) && !defined(__CUDACC__)
+  TORCH_INTERNAL_ASSERT(
+      FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr");
+#else
+  static_assert(
+      FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr");
+#endif
+#if !defined(C10_MOBILE)
+  (void)func_ptr; // Suppress unused variable warning
+  using WrappedFunctor = typename impl::WrapFunctionIntoFunctor<FuncPtr>::type;
+  return makeFromUnboxedFunctor<AllowLegacyTypes, WrappedFunctor>(
+      detail::make_unique_base<OperatorKernel, WrappedFunctor>());
+#else
+  // Mobile builds favor binary size over speed: rather than inlining the
+  // kernel into the wrapper, defer to makeFromUnboxedRuntimeFunction.
+  return makeFromUnboxedRuntimeFunction(func_ptr.func_ptr());
+#endif
+}
+// Creates a KernelFunction from a runtime function pointer by wrapping it in
+// impl::WrapFunctionIntoRuntimeFunctor. `func` must not be null.
+template <bool AllowLegacyTypes, class FuncType>
+inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(
+    FuncType* func) {
+  static_assert(
+      guts::is_function_type<FuncType>::value,
+      "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type.");
+  static_assert(
+      !std::is_same_v<FuncType, BoxedKernelFunction>,
+      "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead.");
+  TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr");
+  using RuntimeFunctor =
+      impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>;
+  return makeFromUnboxedFunctor<AllowLegacyTypes, RuntimeFunctor>(
+      detail::make_unique_base<OperatorKernel, RuntimeFunctor>(func));
+}
+// makeFromUnboxedLambda overload selected for stateless lambdas.
+template <bool AllowLegacyTypes, class Lambda>
+inline std::enable_if_t<
+    guts::is_stateless_lambda<std::decay_t<Lambda>>::value,
+    KernelFunction>
+KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) {
+  static_assert(
+      guts::is_functor<std::decay_t<Lambda>>::value,
+      "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type.");
+#if !defined(C10_MOBILE)
+  using LambdaFunctor =
+      impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>;
+  return makeFromUnboxedFunctor<AllowLegacyTypes, LambdaFunctor>(
+      detail::make_unique_base<OperatorKernel, LambdaFunctor>(
+          std::forward<Lambda>(lambda)));
+#else
+  // Mobile builds favor binary size over speed: rather than inlining the
+  // kernel into the wrapper, defer to makeFromUnboxedRuntimeFunction.
+  using FuncType =
+      typename guts::infer_function_traits_t<std::decay_t<Lambda>>::func_type;
+  return makeFromUnboxedRuntimeFunction<AllowLegacyTypes, FuncType>(lambda);
+#endif
+}
+// makeFromUnboxedLambda overload selected for lambdas that are not stateless;
+// the lambda is always stored inside a WrapFunctionIntoRuntimeFunctor.
+template <bool AllowLegacyTypes, class Lambda>
+inline std::enable_if_t<
+    !guts::is_stateless_lambda<std::decay_t<Lambda>>::value,
+    KernelFunction>
+KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) {
+  static_assert(
+      guts::is_functor<std::decay_t<Lambda>>::value,
+      "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type.");
+  using LambdaFunctor =
+      impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>;
+  return makeFromUnboxedFunctor<AllowLegacyTypes, LambdaFunctor>(
+      detail::make_unique_base<OperatorKernel, LambdaFunctor>(
+          std::forward<Lambda>(lambda)));
+}
+// A token is valid until invalidate() has stored `true` into invalid_.
+inline bool KernelToken::isValid() const {
+  const bool invalidated = invalid_.load(std::memory_order_acquire);
+  return !invalidated;
+}
+// Marks the token invalid (release store, paired with the acquire load in
+// isValid()).
+inline void KernelToken::invalidate() {
+  constexpr bool kInvalid = true;
+  invalid_.store(kInvalid, std::memory_order_release);
+}
+// Stores a copy of `*kernel` (or a default-constructed KernelFunction when
+// `kernel` is null) together with a fresh token, a debug string and the
+// operator handle.
+inline SafeKernelFunction::SafeKernelFunction(
+    const KernelFunction* kernel,
+    std::string debug,
+    std::shared_ptr<OperatorHandle> opHandle)
+    : kernel_(kernel ? *kernel : KernelFunction()),
+      token_(std::make_shared<KernelToken>()),
+      debug_(std::move(debug)),
+      opHandle_(std::move(opHandle)) {
+  if (kernel == nullptr) {
+    return;
+  }
+  // Hand the token to the original kernel so that it gets invalidated when
+  // that kernel is destroyed.
+  kernel->registerToken(token_);
+}
+// Boxed call that first checks the token; fails via TORCH_CHECK when the
+// token is missing or has been invalidated.
+inline void SafeKernelFunction::callBoxed(
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) const {
+  TORCH_CHECK(
+      token_ && token_->isValid(),
+      "SafeKernelFunction has been invalidated ",
+      debug_);
+  const KernelFunction& guarded_kernel = kernel_;
+  guarded_kernel.callBoxed(opHandle, dispatchKeySet, stack);
+}
+} // namespace c10
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/OperatorKernel.h ADDED Viewed

	@@ -0,0 +1,32 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/util/intrusive_ptr.h>
+namespace c10 {
+/**
+ * Base class for c10 kernel functors: inherit from OperatorKernel to
+ * implement a c10 kernel.
+ *
+ * Example:
+ * > namespace {
+ * >   class my_kernel_cpu final : public c10::OperatorKernel {
+ * >   public:
+ * >     Tensor operator()(Tensor a, Tensor b) {...}
+ * >   };
+ * > }
+ *
+ * A kernel class is allowed to have data members, but these are equivalent
+ * to global variables. The kernel implementation is responsible for
+ * preventing race conditions on them.
+ *
+ * See below for how to register this kernel with PyTorch.
+ */
+struct TORCH_API OperatorKernel : public c10::intrusive_ptr_target {
+  ~OperatorKernel() override = default;
+};
+} // namespace c10
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h ADDED Viewed

	@@ -0,0 +1,43 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/CompileTimeFunctionPointer.h>
+namespace c10::impl {
+namespace detail {
+// Primary template; only the typelist specialization below is meant to be
+// used.
+template <typename FuncPtr, typename ReturnType, typename ParameterList>
+class WrapFunctionIntoFunctor_ {};
+// Kernel functor whose call operator forwards its arguments to the function
+// returned by FuncPtr::func_ptr().
+template <typename FuncPtr, typename ReturnType, typename... Parameters>
+class WrapFunctionIntoFunctor_<
+    FuncPtr,
+    ReturnType,
+    guts::typelist::typelist<Parameters...>>
+    final : public c10::OperatorKernel {
+ public:
+  C10_ALWAYS_INLINE decltype(auto) operator()(Parameters... args) {
+    return FuncPtr::func_ptr()(std::forward<Parameters>(args)...);
+  }
+};
+} // namespace detail
+// WrapFunctionIntoFunctor: turns a compile time function pointer into a kernel
+// functor type (exposed as `type`). Because the pointer is known at compile
+// time, many compilers can inline the wrapped function into the wrapper, so
+// the wrapping adds no performance overhead.
+template <class FuncPtr>
+struct WrapFunctionIntoFunctor final {
+  static_assert(
+      c10::is_compile_time_function_pointer<FuncPtr>::value,
+      "WrapFunctionIntoFunctor can only wrap functions created with TORCH_FN.");
+
+ private:
+  using wrapped_traits = guts::function_traits<typename FuncPtr::FuncType>;
+
+ public:
+  using type = detail::WrapFunctionIntoFunctor_<
+      FuncPtr,
+      typename wrapped_traits::return_type,
+      typename wrapped_traits::parameter_types>;
+};
+} // namespace c10::impl
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/WrapFunctionIntoRuntimeFunctor.h ADDED Viewed

	@@ -0,0 +1,46 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/util/TypeTraits.h>
+namespace c10::impl {
+namespace detail {
+// Primary template; only the typelist specialization below is meant to be
+// used.
+template <typename FuncType, typename ReturnType, typename ParameterList>
+class WrapFunctionIntoRuntimeFunctor_ {};
+// Kernel functor that stores a callable of type FuncType and forwards its
+// call operator's arguments to it.
+template <typename FuncType, typename ReturnType, typename... Parameters>
+class WrapFunctionIntoRuntimeFunctor_<
+    FuncType,
+    ReturnType,
+    guts::typelist::typelist<Parameters...>>
+    final : public c10::OperatorKernel {
+ public:
+  // Stores the given callable by forwarding it into kernel_func_.
+  template <typename FuncType_>
+  explicit WrapFunctionIntoRuntimeFunctor_(FuncType_&& kernel_func)
+      : kernel_func_(std::forward<FuncType_>(kernel_func)) {}
+  // Invokes the stored callable with the given arguments.
+  decltype(auto) operator()(Parameters... args) {
+    return kernel_func_(std::forward<Parameters>(args)...);
+  }
+
+ private:
+  FuncType kernel_func_;
+};
+} // namespace detail
+// WrapFunctionIntoRuntimeFunctor: wraps any runtime callable into a functor
+// that inherits from c10::OperatorKernel, so it can be used as a c10 kernel.
+// It can be used, for example, with lambdas, functors or function pointers.
+// For function pointers, the pointer is only known at runtime, so calling it
+// incurs an overhead each time the kernel is invoked.
+template <class FuncType>
+using WrapFunctionIntoRuntimeFunctor = detail::WrapFunctionIntoRuntimeFunctor_<
+    FuncType,
+    typename guts::infer_function_traits_t<FuncType>::return_type,
+    typename guts::infer_function_traits_t<FuncType>::parameter_types>;
+} // namespace c10::impl
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/boxing.h ADDED Viewed

	@@ -0,0 +1,415 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// This file contains boxing (not unboxing) logic,
+// i.e. how to make a vector<IValue> from a set of concrete arguments.
+#include <ATen/core/ivalue.h>
+#include <ATen/core/stack.h>
+#include <c10/core/TensorOptions.h>
+#include <ATen/core/boxing/BoxedKernel.h>
+#include <c10/util/Metaprogramming.h>
+#include <type_traits>
+namespace c10::impl {
+//
+// utils
+//
+// is_mutable_tensor_ref<T>: true only for T == at::Tensor&.
+template <typename T>
+struct is_mutable_tensor_ref : std::false_type {};
+template <>
+struct is_mutable_tensor_ref<at::Tensor&> : std::true_type {};
+// is_tuple_of_mutable_tensor_refs<T>: false for non-tuples; for a std::tuple,
+// true when every element type satisfies is_mutable_tensor_ref.
+template <typename T, typename Enable = void>
+struct is_tuple_of_mutable_tensor_refs : std::false_type {};
+template <typename T>
+struct is_tuple_of_mutable_tensor_refs<
+    T,
+    std::enable_if_t<guts::is_instantiation_of<std::tuple, T>::value>>
+    : guts::typelist::
+          all<is_mutable_tensor_ref, guts::typelist::from_tuple_t<T>> {};
+// has_ivalue_to<T> detects whether the member call IValue::to<T>() is
+// well-formed for T.
+//
+template <typename T, typename Enable = void>
+struct has_ivalue_to : std::false_type {};
+// Names the result type of IValue::to<T>(); ill-formed when no such call
+// exists, which is what the specialization below keys on.
+template <typename T>
+struct ivalue_to_helper {
+  using type = decltype(std::declval<IValue>().template to<T>());
+};
+template <typename T>
+using ivalue_to_helper_t = typename ivalue_to_helper<T>::type;
+template <typename T>
+struct has_ivalue_to<T, std::void_t<ivalue_to_helper_t<T>>> : std::true_type {};
+//
+// boxing predicates
+//
+// An argument type is boxable when IValue can be constructed from its decayed
+// type.
+template <class T>
+using can_box = std::disjunction<
+    std::is_constructible<IValue, std::decay_t<T>>,
+    // TensorOptions cannot be turned into an IValue directly, but
+    // torch::jit::push knows how to handle them
+    std::is_same<TensorOptions, std::decay_t<T>>>;
+// True when every type in Ts is boxable.
+template <class... Ts>
+using can_box_all = std::conjunction<can_box<Ts>...>;
+// A result type is unboxable when it can be extracted from an IValue (or is
+// void) and is not an lvalue reference.
+template <class T>
+using can_unbox = std::conjunction<
+    std::disjunction<
+        has_ivalue_to<T>,
+        // a void return is acceptable as well
+        std::is_same<void, T>>,
+    std::negation<std::is_lvalue_reference<T>>>;
+//
+// boxArgs - builds a fresh IValue stack and pushes the unboxed args onto it
+//
+template <class... Args>
+torch::jit::Stack boxArgs(Args... args) {
+  // TODO Reuse stack vector instead of allocating?
+  constexpr size_t kArgCount = sizeof...(Args);
+  torch::jit::Stack boxed;
+  boxed.reserve(kArgCount);
+  torch::jit::push(boxed, std::forward<Args>(args)...);
+  return boxed;
+}
+// Number of stack slots a single argument of type T occupies once boxed.
+// Generic case: one slot. TensorOptions passed by reference is rejected at
+// compile time.
+template <typename T>
+inline constexpr size_t boxed_size_one() {
+  static_assert(
+      !std::is_same_v<std::decay_t<T>, c10::TensorOptions>,
+      "need to patch this path to support TensorOptions passed by reference");
+  return 1;
+}
+// A TensorOptions is pushed by torch::jit::push as 4 values; keep this number
+// in sync with that.
+template <>
+inline constexpr size_t boxed_size_one<c10::TensorOptions>() {
+  return 4;
+}
+// BoxedSize<Ts...>::value is the sum of boxed_size_one<T>() over all Ts
+// (0 for an empty list).
+// NOTE: this could probably be simplified with C++17 fold expressions.
+template <class...>
+struct BoxedSize : std::integral_constant<size_t, 0> {};
+template <class Head, class... Tail>
+struct BoxedSize<Head, Tail...>
+    : std::integral_constant<
+          size_t,
+          boxed_size_one<Head>() + BoxedSize<Tail...>::value> {};
+// Total number of stack slots needed to box arguments of types Args...
+template <class... Args>
+static inline constexpr size_t boxed_size() {
+  return (size_t{0} + ... + boxed_size_one<Args>());
+}
+// Constructs an IValue from `arg` in the slot `dest` points at, then advances
+// `dest` by one slot.
+template <typename T>
+C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(IValue*& dest, T& arg) {
+  IValue* const slot = dest++;
+  new (slot) IValue(arg);
+}
+// TensorOptions overload: writes four consecutive IValues (scalar type,
+// layout, device, pinned memory) and advances `dest` past them.
+C10_ALWAYS_INLINE_UNLESS_MOBILE void boxToStack(
+    IValue*& dest,
+    c10::TensorOptions options) {
+  IValue* slot = dest;
+  new (slot++) IValue(c10::typeMetaToScalarType(options.dtype()));
+  dest = slot;
+  new (slot++) IValue(options.layout());
+  dest = slot;
+  new (slot++) IValue(options.device());
+  dest = slot;
+  new (slot++) IValue(options.pinned_memory());
+  dest = slot;
+}
+// Nothing to box: leaves the destination pointer untouched.
+inline void boxArgsToStack(IValue*& /*unused*/) {}
+// Boxes `arg` followed by each of `args`, in order, via boxToStack; `dest` is
+// advanced as values are written.
+template <typename T, typename... Args>
+C10_ALWAYS_INLINE_UNLESS_MOBILE void boxArgsToStack(
+    IValue*& dest,
+    T& arg,
+    Args&... args) {
+  boxToStack(dest, arg);
+  (boxToStack(dest, args), ...);
+}
+//
+// PopResult is a helper class template; its specializations handle popping a
+// single return value and multiple return values, respectively.
+//
+// Primary template: converts the first stack entry to Result.
+template <class Result>
+struct PopResult final {
+  static Result call(Stack& stack) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        stack.size() == 1,
+        "Boxed kernel was expected to return one value on the stack, ",
+        "but instead pushed ",
+        stack.size(),
+        " values.");
+    return std::move(stack.front()).to<Result>();
+  }
+};
+// Tuple specialization: converts stack entries 0..N-1 to the tuple's element
+// types and returns them as one tuple.
+template <class... Types>
+struct PopResult<std::tuple<Types...>> final {
+  using Result = std::tuple<Types...>;
+  static Result call(Stack& stack) {
+    // a boxed kernel with a tuple return type has pushed several values onto
+    // the stack
+    constexpr int RetCount = sizeof...(Types);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        stack.size() == RetCount,
+        "Boxed kernel was expected to return ",
+        RetCount,
+        " values on the stack, ",
+        "but instead pushed ",
+        stack.size(),
+        " values.");
+    return unpack_stack(stack, std::make_index_sequence<RetCount>());
+  }
+
+ private:
+  // note: this lives in its own helper only because expanding `Indices`
+  // inline in `call` produced a parse error.
+  template <size_t... Indices>
+  static Result unpack_stack(
+      Stack& stack,
+      std::index_sequence<Indices...> /*unused*/) {
+    return std::make_tuple((std::move(stack[Indices]).template to<Types>())...);
+  }
+};
+//
+// BoxedKernelWrapper
+//
+// For a given function type FT, BoxedKernelWrapper<FT> implements
+// a `call` method that
+// - takes a boxed kernel and unboxed arguments as specified by FT,
+// - calls `boxArgs` to box the arguments
+// - calls the boxed kernel
+// - unboxes and returns the result
+//
+// The partial specializations below handle various cases: in
+// particular, not all types appearing in op signatures are supported,
+// and ops returning references have nonstandard wrapper implementations.
+//
+// 1. The base specialization of BoxedKernelWrapper should never be
+// instantiated. A "no call method defined on BoxedKernelWrapper" compile error
+// means that an op signature has failed to trigger any of the partial
+// specializations that follow this one.
+//
+template <class FuncType, class Enable = void>
+struct BoxedKernelWrapper {
+  // Why this is not simply static_assert(false, ...):
+  // a static_assert should only fire when the template is actually
+  // instantiated (rather than every time the file is parsed), and the way to
+  // get that is to make the condition depend on a template parameter, e.g.
+  // FuncType here. `sizeof(FuncType) != sizeof(FuncType)` is always false, so
+  // on instantiation it has the same effect.
+  static_assert(
+      sizeof(FuncType) != sizeof(FuncType),
+      "Function signature contains one or more unsupported parameter and/or return types. "
+      "Look for a nearby error like "
+      "\"'call' is not a member of 'c10::impl::BoxedKernelWrapper<(your function type), void>'\" "
+      "- (your function type) is the unsupported signature.");
+};
+//
+// 2. Supported signatures that do not involve non-const Tensor refs, i.e.
+// "functional" ops.
+//
+template <class Result, class... Args>
+struct BoxedKernelWrapper<
+    Result(Args...),
+    std::enable_if_t<
+        can_box_all<Args...>::value && can_unbox<Result>::value &&
+        !is_tuple_of_mutable_tensor_refs<Result>::value>> {
+  // Boxes the arguments, runs the boxed kernel, and unboxes whatever the
+  // kernel left on the stack.
+  static Result call(
+      const BoxedKernel& boxed_kernel_func,
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Args... args) {
+    torch::jit::Stack stack = boxArgs<Args...>(std::forward<Args>(args)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &stack);
+    if constexpr (std::is_same_v<void, Result>) {
+      // void op: the boxed kernel must not have pushed anything.
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          stack.empty(),
+          "Boxed kernel was expected to return no values on the stack, ",
+          "but instead returned ",
+          stack.size(),
+          " values.");
+    } else {
+      // the op has pushed one or more values onto the stack.
+      return PopResult<Result>::call(stack);
+    }
+  }
+};
+//
+// 3. In-place ops: the first argument is a single non-const Tensor reference,
+// which is also what the op returns.
+//
+// Note: every signature matching this pattern is assumed to belong to such an
+// op. That is why the generated BoxedKernelWrapper specializations simply
+// return the in-place argument.
+//
+template <class... OtherArgs>
+struct BoxedKernelWrapper<
+    at::Tensor&(at::Tensor&, OtherArgs...),
+    std::enable_if_t<can_box_all<OtherArgs...>::value>> {
+  static at::Tensor& call(
+      const BoxedKernel& boxed_kernel_func,
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      at::Tensor& outArg,
+      OtherArgs... otherArgs) {
+    torch::jit::Stack boxed = boxArgs<at::Tensor&, OtherArgs...>(
+        outArg, std::forward<OtherArgs>(otherArgs)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &boxed);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        boxed.size() == 1,
+        "Boxed kernel was expected to return a single value on the stack, ",
+        "but instead returned ",
+        boxed.size(),
+        " values.");
+    // The kernel's stack result is not unboxed; the caller's tensor is
+    // returned directly.
+    return outArg;
+  }
+};
+//
+// 3.5. In-progress migration to make in-place ops take and return
+// const references instead.
+template <class... OtherArgs>
+struct BoxedKernelWrapper<
+    const at::Tensor&(const at::Tensor&, OtherArgs...),
+    std::enable_if_t<can_box_all<OtherArgs...>::value>> {
+  static const at::Tensor& call(
+      const BoxedKernel& boxed_kernel_func,
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      const at::Tensor& outArg,
+      OtherArgs... otherArgs) {
+    torch::jit::Stack boxed = boxArgs(outArg, otherArgs...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &boxed);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        boxed.size() == 1,
+        "Boxed kernel was expected to return a single value on the stack, ",
+        "but instead returned ",
+        boxed.size(),
+        " values.");
+    // The kernel's stack result is not unboxed; the caller's tensor is
+    // returned directly.
+    return outArg;
+  }
+};
+//
+// 4. Out-of-place ops whose final argument is a single non-const Tensor
+// reference, which is also what the op returns.
+//
+// Note: every signature matching this pattern is assumed to belong to such an
+// op. That assumption lets the generated BoxedKernelWrapper specializations
+// simply return the out argument.
+//
+template <class FirstArg, class... RestArgs>
+struct BoxedKernelWrapper<
+    at::Tensor&(FirstArg, RestArgs...),
+    std::enable_if_t<
+        can_box_all<FirstArg, RestArgs...>::value
+        // in-place kernels with a non-const Tensor arg at the front are
+        // excluded here so that they unambiguously select the preceding
+        // specialization.
+        && !is_mutable_tensor_ref<FirstArg>::value>> {
+  static at::Tensor& call(
+      const BoxedKernel& boxed_kernel_func,
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      FirstArg firstArg,
+      RestArgs... restArgs) {
+    torch::jit::Stack boxed = boxArgs<FirstArg, RestArgs...>(
+        std::forward<FirstArg>(firstArg), std::forward<RestArgs>(restArgs)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &boxed);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        boxed.size() == 1,
+        "Boxed kernel was expected to return a single value on the stack, ",
+        "but instead returned ",
+        boxed.size(),
+        " values.");
+    // Touching restArgs again after forwarding it is fine: the last element
+    // is known to be of type `Tensor&`.
+    constexpr size_t kLastIndex = sizeof...(RestArgs) - 1;
+    return std::get<kLastIndex>(std::tuple<RestArgs...>{restArgs...});
+  }
+};
+//
+// 5. Out-of-place ops whose final arguments are several non-const Tensor
+// references, returned together in a std::tuple.
+//
+// Note: every signature matching this pattern is assumed to belong to such an
+// op. That assumption lets the generated BoxedKernelWrapper specializations
+// simply return the out arguments.
+//
+template <class Result, class... Args>
+struct BoxedKernelWrapper<
+    Result(Args...),
+    std::enable_if_t<
+        can_box_all<Args...>::value &&
+        is_tuple_of_mutable_tensor_refs<Result>::value>> {
+  static Result call(
+      const BoxedKernel& boxed_kernel_func,
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Args... args) {
+    using ArgTuple = std::tuple<Args...>;
+    constexpr int RetCount = std::tuple_size_v<Result>;
+    torch::jit::Stack boxed = boxArgs<Args...>(std::forward<Args>(args)...);
+    boxed_kernel_func.callBoxed(opHandle, dispatchKeySet, &boxed);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+        boxed.size() == RetCount,
+        "Boxed kernel was expected to return ",
+        RetCount,
+        " values on the stack, ",
+        "but instead returned ",
+        boxed.size(),
+        " values.");
+    // Touching args again after forwarding it is fine: the last RetCount
+    // elements are known to be of type `Tensor&`.
+    auto out_refs = guts::tuple_take<ArgTuple, -RetCount>(
+        ArgTuple{std::forward<Args>(args)...});
+    static_assert(
+        std::is_same_v<Result, decltype(out_refs)>,
+        "The parameter list of an op returning a tuple of Tensor references "
+        "must end with an equal number of Tensor reference parameters.");
+    return out_refs;
+  }
+};
+} // namespace c10::impl
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h ADDED Viewed

	@@ -0,0 +1,790 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/IListRef.h>
+#include <ATen/core/boxing/OperatorKernel.h>
+#include <ATen/core/ivalue.h>
+#include <ATen/core/stack.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/TypeList.h>
+#include <c10/util/intrusive_ptr.h>
+#include <utility>
+namespace c10 {
+using Stack = torch::jit::Stack; // TODO Instead of this, move torch::jit::Stack
+                                 // to the c10 namespace.
+class OperatorHandle;
+/*
+ * [Note: Argument forwarding in the dispatcher]
+ *
+ * The dispatcher uses a somewhat unusual way to forward arguments through
+ * several layers of wrapper functions. This can be confusing because an
+ * experienced C++ programmer would look at this and think "oh this is supposed
+ * to be forwarding a universal reference but the && is missing. This is a
+ * bug.". It is not a bug. The common way in C++ to forward arguments is to use
+ * universal references:
+ *
+ * > template<class T> void func(T&& arg) { func2(std::forward<T>(arg)); }
+ *
+ * but that relies on inferring the correct reference type (i.e. value vs & vs
+ * &&) from the argument. In our case, we cannot rely on the argument as
+ * supplied by the caller, because that could infer a different reference type
+ * than was used in the kernel function. The correct reference type is dictated
+ * by the kernel signature and must be identical since we cast function pointers
+ * through void* pointers and mismatches would be UB. So we need a forwarding
+ * pattern that determines the reference type to use by looking at the
+ * explicitly supplied operator signature, not by looking at the argument we're
+ * calling it with.
+ *
+ * What does std::forward do, exactly?
+ * ------------------------------------
+ * std::forward<T>(t) is a way to cast t to the reference type supplied in T.
+ * Let's assume decay_t<T> == U and T is either U or some reference of U.
+ *  - std::forward<T&>(t) will return U&, no matter what kind of reference t is.
+ *  - std::forward<T&&>(t) will return U&&, no matter what kind of reference t
+ * is.
+ *  - std::forward<T>(t) will return U&& (not U!), no matter what kind of
+ * reference t is.
+ *
+ * For universal references, that means that in the following function
+ * > template<class T> void func(T&& arg) { func2(std::forward<T>(arg)); }
+ *
+ *  - when called with arg being an rvalue reference or non-reference value, T
+ * gets inferred to be a non-reference U, and std::forward<T>(t) will return
+ * U&&, correctly moving the argument.
+ *  - when called with arg being an lvalue reference, T gets inferred to be U&
+ * because that's the only way to match the signature (in C++, a type that is
+ * (U&)&& will collapse to U&). That means std::forward<T>(t) will return U& and
+ * the value will not be moved but passed on as an lvalue reference.
+ *
+ * How do we use that?
+ * ------------------------------------
+ * But std::forward can also be used outside of the common "universal
+ * forwarding" pattern to change reference types. So instead of following the
+ * common C++ pattern, we notice what std::forward<T>() actually does, and that
+ * is it takes a value and changes its reference to the type of reference passed
+ * in as T. If we don't infer T but explicitly specify it, we can use this to
+ * forward based on an explicitly specified reference type instead of the
+ * inferred argument type.
+ *
+ * This is why many of the dispatcher functions look like
+ * > template<class T> func(T t) { func2<T>(std::forward<T>(t)); }
+ * instead of the common
+ * > template<class T> func(T&& t) { func2(std::forward<T>(t)); }
+ *
+ * and are expected to be called by explicitly specifying the template
+ * parameters in a way that matches the expected operator signature at each call
+ * site.
+ */
+namespace impl {
+// supported_primitive_arg_types is the set of primitive types that kernel
+// functions may use as arguments or returns. Lists, dicts and optionals
+// containing these types are supported in addition.
+using supported_primitive_arg_types = guts::typelist::typelist<
+    // builtin scalars and strings
+    int64_t, double, bool, std::string_view,
+    // tensor and scalar value types
+    at::Tensor, at::Scalar,
+    // remaining c10 / ATen descriptor types
+    c10::QScheme, c10::ScalarType, c10::Device, c10::DeviceIndex,
+    c10::Layout, c10::MemoryFormat, at::Dimname>;
+// Terminology: the wrapper being built is a boxed functor (it takes IValues)
+// around an unboxed functor (it takes C++ arguments). So "outside" is boxed
+// and "inside" is unboxed.
+//
+//  - A valid input type is one the wrapper can unbox from an IValue into a
+//    C++ value.
+//  - A valid output type is one the wrapper can receive as a C++ value from
+//    the unboxed functor and box into an IValue.
+//
+// assert_is_valid_input_type
+// checks that T can be unboxed from an IValue into a C++ value. This primary
+// template accepts every T; the partial specializations of this template
+// carry the static_asserts for specific types.
+//
+template <class T, bool AllowDeprecatedTypes, class Enable = void>
+struct assert_is_valid_input_type {
+  assert_is_valid_input_type() {
+    constexpr bool is_primitive =
+        guts::typelist::contains<supported_primitive_arg_types, T>::value;
+    if constexpr (!is_primitive) {
+      /* not a primitive: this must be an instance of a valid custom class,
+         since it can only have been created via IValue(x), which ensures
+         this. */
+    } else {
+      /* primitive type, everything is ok */
+    }
+  }
+};
+// std::optional<Wrapped> is checked by checking the wrapped type.
+template <class Wrapped, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<std::optional<Wrapped>, AllowDeprecatedTypes>
+    : assert_is_valid_input_type<Wrapped, AllowDeprecatedTypes> {};
+// TypeCheckHelper instantiates assert_is_valid_input_type for each type of a
+// parameter pack, peeling off one type per inheritance level.
+template <bool AllowDeprecatedTypes, class... Args>
+struct TypeCheckHelper;
+// End of the recursion: empty pack, nothing to check.
+template <bool AllowDeprecatedTypes>
+struct TypeCheckHelper<AllowDeprecatedTypes> {};
+// Recursive step: check `First` through the member below and hand the
+// remaining types to the base class.
+template <bool AllowDeprecatedTypes, class First, class... Remaining>
+struct TypeCheckHelper<AllowDeprecatedTypes, First, Remaining...>
+    : TypeCheckHelper<AllowDeprecatedTypes, Remaining...> {
+  assert_is_valid_input_type<First, AllowDeprecatedTypes> check;
+};
+// A std::tuple input is checked element by element via TypeCheckHelper.
+template <class... Elems, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<std::tuple<Elems...>, AllowDeprecatedTypes>
+    : TypeCheckHelper<AllowDeprecatedTypes, Elems...> {};
+// Dict<K, V> input: V is checked through the base class, K has to be listed
+// in impl::valid_dict_key_types.
+template <class K, class V, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<Dict<K, V>, AllowDeprecatedTypes>
+    : assert_is_valid_input_type<V, AllowDeprecatedTypes> {
+  static_assert(
+      guts::typelist::contains<impl::valid_dict_key_types, K>::value,
+      "You tried to register a kernel with an unsupported input type: Dict<Key, Value> where Key is invalid. We only support int64_t, double, bool, and string.");
+};
+// std::unordered_map<K, V> input: only compiles when AllowDeprecatedTypes is
+// set; otherwise the first assertion points the user to Dict. V is checked
+// through the base class and K against impl::valid_dict_key_types.
+template <class K, class V, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    std::unordered_map<K, V>,
+    AllowDeprecatedTypes>
+    : assert_is_valid_input_type<V, AllowDeprecatedTypes> {
+  static_assert(
+      AllowDeprecatedTypes,
+      "You tried to register a kernel with an unsupported input type: std::unordered_map<Key, Value>. Please use Dict<Key, Value> instead.");
+  static_assert(
+      guts::typelist::contains<impl::valid_dict_key_types, K>::value,
+      "You tried to register a kernel with an unsupported input type: std::unordered_map<Key, Value> where Key is invalid. We only support int64_t, double, bool, and string.");
+};
+// Sequence-like inputs. Each specialization checks the element type through
+// its base class and rejects at::Scalar as the element type.
+//
+// List<Elem>
+template <class Elem, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<List<Elem>, AllowDeprecatedTypes>
+    : assert_is_valid_input_type<Elem, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<at::Scalar, Elem>,
+      "You tried to register a kernel with an unsupported input type: List<Scalar>. Please use List<int64_t>, List<double> or Tensor instead.");
+};
+// c10::ArrayRef<Elem>
+template <class Elem, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<c10::ArrayRef<Elem>, AllowDeprecatedTypes>
+    : assert_is_valid_input_type<Elem, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<at::Scalar, Elem>,
+      "You tried to register a kernel with an unsupported input type: ArrayRef<Scalar>. Please use List<int64_t>, List<double> or Tensor instead.");
+};
+// c10::OptionalArrayRef<Elem>
+template <class Elem, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    c10::OptionalArrayRef<Elem>,
+    AllowDeprecatedTypes>
+    : assert_is_valid_input_type<Elem, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<at::Scalar, Elem>,
+      "You tried to register a kernel with an unsupported input type: OptionalArrayRef<Scalar>. Please use List<int64_t>, List<double> or Tensor instead.");
+};
+// std::array<Elem, N>
+template <class Elem, size_t N, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<std::array<Elem, N>, AllowDeprecatedTypes>
+    : assert_is_valid_input_type<Elem, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<at::Scalar, Elem>,
+      "You tried to register a kernel with an unsupported input type: std::array<Scalar, N>. Please use std::array<int64_t, N> instead.");
+};
+// Input types that are always rejected. Each specialization is selected via
+// enable_if and fails with a dedicated message; guts::false_t<T> makes the
+// assertion depend on T.
+//
+// float
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<T, float>>> {
+  // There is no reason to support float when we have double. Keep the API lean.
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported input type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string.");
+};
+// const char*
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<T, const char*>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported input type: const char*. Please use std::string_view instead.");
+};
+// std::vector<bool>
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<T, std::vector<bool>>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported input type: vector<bool>. Please use List<bool> instead.");
+};
+// any integral type that is not in supported_primitive_arg_types
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<
+        !guts::typelist::contains<supported_primitive_arg_types, T>::value &&
+        std::is_integral_v<T>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported integral input type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string.");
+};
+// const c10::SymInt&
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_input_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<T, const c10::SymInt&>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel taking c10::SymInt by reference. Please accept it by value instead.");
+};
+// TODO: it probably would be good to tighten this up quite a bit more with
+// an explicit list for everything
+//
+// assert_is_valid_output_type
+// Counterpart of the input check for kernel return types. This primary
+// template accepts every T; the partial specializations of this template
+// carry the static_asserts for specific types.
+//
+template <class T, bool AllowDeprecatedTypes, class Enable = void>
+struct assert_is_valid_output_type {
+  assert_is_valid_output_type() {
+    constexpr bool is_primitive =
+        guts::typelist::contains<supported_primitive_arg_types, T>::value;
+    if constexpr (!is_primitive) {
+      /* not a primitive: T is verified to be a registered custom class in the
+         IValue constructor, so no benefit in double-checking here */
+    } else {
+      /* primitive type, everything is ok */
+    }
+  }
+};
+// std::optional<Wrapped> output: checked by checking the wrapped type.
+template <class Wrapped, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<std::optional<Wrapped>, AllowDeprecatedTypes>
+    : assert_is_valid_output_type<Wrapped, AllowDeprecatedTypes> {};
+// c10::OptionalArrayRef<Elem> output: checked by checking the element type.
+template <class Elem, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    c10::OptionalArrayRef<Elem>,
+    AllowDeprecatedTypes>
+    : assert_is_valid_output_type<Elem, AllowDeprecatedTypes> {};
+// Dict<K, V> output: V is checked through the base class, K has to be listed
+// in impl::valid_dict_key_types, and V must not be at::Scalar.
+template <class K, class V, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<Dict<K, V>, AllowDeprecatedTypes>
+    : assert_is_valid_output_type<V, AllowDeprecatedTypes> {
+  static_assert(
+      guts::typelist::contains<impl::valid_dict_key_types, K>::value,
+      "You tried to register a kernel with an unsupported output type: Dict<Key, Value> where Key is invalid. We only support int64_t, double, bool, and string.");
+  static_assert(
+      !std::is_same_v<at::Scalar, V>,
+      "You tried to register a kernel with an unsupported output type: Dict<Key, Scalar>. Please use Dict<Key, int64_t> or Dict<Key, double>.");
+};
+// std::unordered_map<K, V> output: only compiles when AllowDeprecatedTypes is
+// set; otherwise the first assertion points the user to Dict. The key and
+// value restrictions are the same as for Dict outputs.
+template <class K, class V, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    std::unordered_map<K, V>,
+    AllowDeprecatedTypes>
+    : assert_is_valid_output_type<V, AllowDeprecatedTypes> {
+  static_assert(
+      AllowDeprecatedTypes,
+      "You tried to register a kernel with an unsupported output type: std::unordered_map<Key, Value>. Please use Dict<Key, Value> instead.");
+  static_assert(
+      guts::typelist::contains<impl::valid_dict_key_types, K>::value,
+      "You tried to register a kernel with an unsupported output type: std::unordered_map<Key, Value> where Key is invalid. We only support int64_t, double, bool, and string.");
+  static_assert(
+      !std::is_same_v<at::Scalar, V>,
+      "You tried to register a kernel with an unsupported output type: std::unordered_map<Key, Scalar>. Please use Dict<Key, int64_t> or Dict<Key, double>.");
+};
+// Sequence-like outputs. Each specialization checks the element type through
+// its base class and rejects at::Scalar as the element type.
+//
+// List<Elem>
+template <class Elem, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<List<Elem>, AllowDeprecatedTypes>
+    : assert_is_valid_output_type<Elem, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<at::Scalar, Elem>,
+      "You tried to register a kernel with an unsupported output type: List<Scalar>. Please use List<int64_t>, List<double> or Tensor instead.");
+};
+// std::vector<Elem>
+template <class Elem, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<std::vector<Elem>, AllowDeprecatedTypes>
+    : assert_is_valid_output_type<Elem, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<at::Scalar, Elem>,
+      "You tried to register a kernel with an unsupported output type: std::vector<Scalar>. Please use List<int64_t>, List<double> or Tensor instead.");
+  // TODO static_assert(AllowDeprecatedTypes, "You tried to register a kernel
+  // with an unsupported output type: std::vector<T>. Please use List<T>
+  // instead.");
+};
+// std::array<Elem, N>
+template <class Elem, size_t N, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<std::array<Elem, N>, AllowDeprecatedTypes>
+    : assert_is_valid_output_type<Elem, AllowDeprecatedTypes> {
+  static_assert(
+      !std::is_same_v<at::Scalar, Elem>,
+      "You tried to register a kernel with an unsupported output type: std::array<Scalar, N>. Please use std::array<int64_t, N> instead.");
+};
+// The specializations of assert_is_valid_output_type below are technically
+// not necessary since we would hit the base case and show an error message
+// there if they didn't exist, but they let us show a better error message in
+// some common error scenarios.
+//
+// float
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<T, float>>> {
+  // There is no reason to support float when we have double. Keep the API lean.
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported output type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string.");
+};
+// const char*
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<T, const char*>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported output type: const char*. Please use std::string_view instead.");
+};
+// std::vector<bool>
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<std::is_same_v<T, std::vector<bool>>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported output type: vector<bool>. Please use List<bool> instead.");
+};
+// any integral type that is not in supported_primitive_arg_types
+template <class T, bool AllowDeprecatedTypes>
+struct assert_is_valid_output_type<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<
+        !guts::typelist::contains<supported_primitive_arg_types, T>::value &&
+        std::is_integral_v<T>>> {
+  static_assert(
+      guts::false_t<T>::value,
+      "You tried to register a kernel with an unsupported integral output type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string.");
+};
+// ivalue_to_arg
+// decay_if_not_tensor<T>::type is std::decay_t<T> for every T except the two
+// tensor reference types, which are kept unchanged by the specializations.
+template <class T>
+struct decay_if_not_tensor final {
+  using type = std::decay_t<T>;
+};
+// `at::Tensor&` stays a mutable lvalue reference.
+template <>
+struct decay_if_not_tensor<at::Tensor&> final {
+  using type = at::Tensor&;
+};
+// `const at::Tensor&` stays a const lvalue reference.
+template <>
+struct decay_if_not_tensor<const at::Tensor&> final {
+  using type = const at::Tensor&;
+};
+// Generic unboxing of one stack slot into a kernel argument of type T.
+// Constructing the assert_is_valid_input_type temporary instantiates the
+// compile-time input check for T; the value is then moved out of the IValue.
+template <class T, bool AllowDeprecatedTypes>
+struct ivalue_to_arg final {
+  static decltype(auto) call(IValue& v) {
+    using InputCheck = assert_is_valid_input_type<T, AllowDeprecatedTypes>;
+    InputCheck();
+    return std::move(v).to<T>();
+  }
+};
+// Specialization for `at::Tensor&` arguments. It relies on the `toTensor()`
+// overload of IValue to hand out a reference instead of copying.
+template <bool AllowDeprecatedTypes>
+struct ivalue_to_arg<at::Tensor&, AllowDeprecatedTypes> final {
+  // The default implementation cannot be used here: it moves from the IValue
+  // and therefore cannot produce an lvalue reference.
+  static at::Tensor& call(IValue& v) {
+    // Tensor& is valid, don't bother asserting
+    at::Tensor& tensor = v.toTensor();
+    return tensor;
+  }
+};
+// Specialization for `const at::Tensor&` arguments; returns a reference
+// obtained from `toTensor()` rather than a moved-out value.
+template <bool AllowDeprecatedTypes>
+struct ivalue_to_arg<const at::Tensor&, AllowDeprecatedTypes> final {
+  // The default implementation should not be used here: it moves from the
+  // IValue, and a caller asking for `const at::Tensor&` didn't ask for that.
+  static const at::Tensor& call(IValue& v) {
+    // const Tensor& is valid, don't bother asserting
+    const at::Tensor& tensor = v.toTensor();
+    return tensor;
+  }
+};
+// Specialization for at::ITensorListRef arguments: the stack slot is read
+// with toTensorList() and returned as a List<at::Tensor>.
+template <bool AllowDeprecatedTypes>
+struct ivalue_to_arg<at::ITensorListRef, AllowDeprecatedTypes> final {
+  static List<at::Tensor> call(IValue& v) {
+    List<at::Tensor> tensors = v.toTensorList();
+    return tensors;
+  }
+};
+// Specialization for ArrayRef<T> arguments.
+template <class T, bool AllowDeprecatedTypes>
+struct ivalue_to_arg<ArrayRef<T>, AllowDeprecatedTypes> final {
+  // An ArrayRef<T> argument is served by unboxing the IValue into a
+  // std::vector<T>, which is implicitly convertible to ArrayRef<T> when it is
+  // passed on to the operator.
+  static std::vector<T> call(IValue& v) {
+    using AsVector = ivalue_to_arg<std::vector<T>, AllowDeprecatedTypes>;
+    return AsVector::call(v);
+  }
+};
+// Specialization for c10::SymIntArrayRef arguments. The IValue is unboxed
+// into a std::vector<c10::SymInt>. When the IValue holds a plain int list,
+// every element is wrapped into a c10::SymInt; otherwise unboxing is delegated
+// to the std::vector<c10::SymInt> conversion.
+template <bool AllowDeprecatedTypes>
+struct ivalue_to_arg<c10::SymIntArrayRef, AllowDeprecatedTypes> final {
+  static std::vector<c10::SymInt> call(IValue& v) {
+    if (v.isIntList()) {
+      std::vector<c10::SymInt> r;
+      auto src = v.toIntList();
+      // The number of elements is known up front: allocate once instead of
+      // growing the vector repeatedly through back_inserter.
+      r.reserve(src.size());
+      std::transform(
+          src.begin(), src.end(), std::back_inserter(r), [](int64_t i) {
+            return c10::SymInt(i);
+          });
+      return r;
+    } else {
+      return ivalue_to_arg<std::vector<c10::SymInt>, AllowDeprecatedTypes>::
+          call(v);
+    }
+  }
+};
+// Specialization for c10::OptionalArray<c10::SymInt> arguments. When the
+// IValue holds a plain int list, every element is wrapped into a c10::SymInt
+// and the result is returned as an OptionalArray; otherwise the IValue is
+// moved out via its OptionalArray<c10::SymInt> conversion.
+template <bool AllowDeprecatedTypes>
+struct ivalue_to_arg<c10::OptionalArray<c10::SymInt>, AllowDeprecatedTypes>
+    final {
+  static OptionalArray<c10::SymInt> call(IValue& v) {
+    if (v.isIntList()) {
+      std::vector<c10::SymInt> r;
+      auto src = v.toIntList();
+      // The number of elements is known up front: allocate once instead of
+      // growing the vector repeatedly through back_inserter.
+      r.reserve(src.size());
+      std::transform(
+          src.begin(), src.end(), std::back_inserter(r), [](int64_t i) {
+            return c10::SymInt(i);
+          });
+      return OptionalArray<c10::SymInt>(std::move(r));
+    } else {
+      return std::move(v).to<OptionalArray<c10::SymInt>>();
+    }
+  }
+};
+// Specialization for std::optional<ArrayRef<T>> arguments.
+template <class T, bool AllowDeprecatedTypes>
+struct ivalue_to_arg<std::optional<ArrayRef<T>>, AllowDeprecatedTypes> final {
+  // The IValue is unboxed into an OptionalArray<T>, which is basically a
+  // std::optional<std::vector<T>> that is implicitly convertible to
+  // std::optional<ArrayRef<T>> when it is passed on to the operator.
+  static OptionalArray<T> call(IValue& v) {
+    using AsOptionalArray = ivalue_to_arg<OptionalArray<T>, AllowDeprecatedTypes>;
+    return AsOptionalArray::call(v);
+  }
+};
+// Specialization for OptionalArrayRef<T> arguments.
+template <class T, bool AllowDeprecatedTypes>
+struct ivalue_to_arg<OptionalArrayRef<T>, AllowDeprecatedTypes> final {
+  // The IValue is unboxed into an OptionalArray<T>, which is basically a
+  // std::optional<std::vector<T>> that is implicitly convertible to
+  // OptionalArrayRef<T> when it is passed on to the operator.
+  static OptionalArray<T> call(IValue& v) {
+    using AsOptionalArray = ivalue_to_arg<OptionalArray<T>, AllowDeprecatedTypes>;
+    return AsOptionalArray::call(v);
+  }
+};
+// return_to_ivalue
+// Boxes a kernel return value of type T into an IValue. The unspecialized
+// template declares no members; the conversions live in the specializations.
+template <class T, bool AllowDeprecatedTypes, class Enable = void>
+struct return_to_ivalue final {};
+// Specialization for every T other than `at::Tensor&`.
+template <class T, bool AllowDeprecatedTypes>
+struct return_to_ivalue<
+    T,
+    AllowDeprecatedTypes,
+    std::enable_if_t<!std::is_same_v<T, at::Tensor&>>>
+    final {
+  // Boxes `v` by moving it; instantiates the output type check for T first.
+  static IValue call(T&& v) {
+    using OutputCheck = assert_is_valid_output_type<T, AllowDeprecatedTypes>;
+    OutputCheck();
+    return c10::ivalue::from(std::move(v));
+  }
+  // Boxes `v` by constructing an IValue from the const reference; also
+  // instantiates the output type check for T first.
+  static IValue copy(const T& v) {
+    using OutputCheck = assert_is_valid_output_type<T, AllowDeprecatedTypes>;
+    OutputCheck();
+    return IValue(v);
+  }
+};
+// Special case so that kernels may return `Tensor&`. No output type check is
+// instantiated here.
+// TODO Delete this once kernels don't do that anymore
+template <bool AllowDeprecatedTypes>
+struct return_to_ivalue<at::Tensor&, AllowDeprecatedTypes, void> final {
+  // Boxes the referenced tensor via c10::ivalue::from.
+  static IValue call(at::Tensor& v) {
+    IValue boxed = c10::ivalue::from(v);
+    return boxed;
+  }
+  // Boxes the referenced tensor via the IValue constructor.
+  static IValue copy(at::Tensor& v) {
+    IValue boxed(v);
+    return boxed;
+  }
+};
+// wrap_kernel_functor_unboxed_
+// Adapts a kernel functor to a static `call` taking (OperatorKernel*,
+// DispatchKeySet, args...). The primary template declares no members; the
+// specializations are selected by the shape of OpSignature.
+template <class KernelFunctor, class OpSignature>
+struct wrap_kernel_functor_unboxed_ final {};
+// This specialization is for kernels with a first argument that is NOT of type
+// DispatchKeySet. This includes kernels with 0 arguments.
+template <class KernelFunctor, class ReturnType, class... ParameterTypes>
+struct wrap_kernel_functor_unboxed_<
+    KernelFunctor,
+    ReturnType(ParameterTypes...)>
+    final {
+  // The explicitly supplied signature has to agree with the functor's
+  // inferred call operator.
+  static_assert(
+      std::is_same_v<
+          typename guts::infer_function_traits_t<KernelFunctor>::return_type,
+          ReturnType>,
+      "Return type mismatch");
+  static_assert(
+      std::is_same_v<
+          typename guts::infer_function_traits_t<
+              KernelFunctor>::parameter_types,
+          guts::typelist::typelist<ParameterTypes...>>,
+      "Parameter types mismatch");
+  // See [Note: Argument forwarding in the dispatcher] for why ParameterTypes
+  // doesn't use &&
+  static ReturnType call(
+      OperatorKernel* functor,
+      DispatchKeySet /*unused*/,
+      ParameterTypes... args) {
+    // Note [Plumbing Keys Through The Dispatcher 2]
+    // See Note [Plumbing Keys Through The Dispatcher] for the background.
+    // This functor explicitly takes in a dispatchKeySet and drops it on the
+    // floor; it does not forward it to the registered kernel.
+    //
+    // This is due to the calling convention within the dispatcher, which
+    // expects all registered kernels to have a first argument of type
+    // DispatchKeySet.
+    // This is not the case for pretty much all manually written kernels,
+    // however. This functor serves to separate the calling convention of the
+    // dispatcher from the calling convention of manually written kernels.
+    auto* kernel = static_cast<KernelFunctor*>(functor);
+    return (*kernel)(std::forward<ParameterTypes>(args)...);
+  }
+};
+// This specialization is for kernels whose first argument is of type
+// DispatchKeySet.
+template <class KernelFunctor, class ReturnType, class... ParameterTypes>
+struct wrap_kernel_functor_unboxed_<
+    KernelFunctor,
+    ReturnType(DispatchKeySet, ParameterTypes...)>
+    final {
+  // The explicitly supplied signature has to agree with the functor's
+  // inferred call operator.
+  static_assert(
+      std::is_same_v<
+          typename guts::infer_function_traits_t<KernelFunctor>::return_type,
+          ReturnType>,
+      "Return type mismatch");
+  static_assert(
+      std::is_same_v<
+          typename guts::infer_function_traits_t<
+              KernelFunctor>::parameter_types,
+          guts::typelist::typelist<DispatchKeySet, ParameterTypes...>>,
+      "Parameter types mismatch");
+  // See [Note: Argument forwarding in the dispatcher] for why ParameterTypes
+  // doesn't use &&
+  static ReturnType call(
+      OperatorKernel* functor,
+      DispatchKeySet dispatchKeySet,
+      ParameterTypes... args) {
+    // The dispatchKeySet is taken in explicitly and forwarded to the
+    // registered kernel as its first argument. See Note [Plumbing Keys
+    // Through The Dispatcher 2] for details.
+    auto* kernel = static_cast<KernelFunctor*>(functor);
+    return (*kernel)(dispatchKeySet, std::forward<ParameterTypes>(args)...);
+  }
+};
+// Convenience alias: picks the wrap_kernel_functor_unboxed_ specialization
+// matching the function type inferred from KernelFunctor.
+template <class KernelFunctor>
+using wrap_kernel_functor_unboxed = wrap_kernel_functor_unboxed_<
+    KernelFunctor,
+    typename guts::infer_function_traits_t<KernelFunctor>::func_type>;
+// call_functor_with_args_from_stack
+// Implementation helper: unboxes one argument per index in
+// `ivalue_arg_indices` from `stack` (read via torch::jit::peek, nothing is
+// popped here) and invokes the functor through wrap_kernel_functor_unboxed.
+// The two trailing parameters exist only to deduce the index pack and the
+// argument types.
+template <
+    class Functor,
+    bool AllowDeprecatedTypes,
+    size_t... ivalue_arg_indices,
+    typename... ArgTypes>
+std::decay_t<typename guts::infer_function_traits_t<Functor>::return_type>
+call_functor_with_args_from_stack_(
+    OperatorKernel* functor,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack,
+    std::index_sequence<ivalue_arg_indices...> /*unused*/,
+    guts::typelist::typelist<ArgTypes...>* /*unused*/) {
+  // When sizeof...(ivalue_arg_indices) == 0 the stack is never touched, so
+  // silence the unused-argument warning explicitly.
+  (void)stack;
+  // DispatchKeySet is explicitly filtered out of the argument list. Some
+  // kernels take a DispatchKeySet as their first argument in order to plumb
+  // keys through the dispatcher. We don't want to expose the DispatchKeySet
+  // type to jit, so this argument is not part of the stack.
+  // See Note [Plumbing Keys Through The Dispatcher] for the background.
+  using UnboxedWrapper = wrap_kernel_functor_unboxed<Functor>;
+  return UnboxedWrapper::call(
+      functor,
+      dispatchKeySet,
+      ivalue_to_arg<
+          typename decay_if_not_tensor<ArgTypes>::type,
+          AllowDeprecatedTypes>::
+          call(torch::jit::peek(
+              *stack, ivalue_arg_indices, sizeof...(ivalue_arg_indices)))...);
+}
+// Entry point: derives the functor's stack-visible argument types, builds
+// the matching index sequence and delegates to
+// call_functor_with_args_from_stack_.
+template <class Functor, bool AllowDeprecatedTypes>
+std::decay_t<typename guts::infer_function_traits_t<Functor>::return_type>
+call_functor_with_args_from_stack(
+    OperatorKernel* functor,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) {
+  // DispatchKeySet is explicitly filtered out of the argument list. Some
+  // kernels take a DispatchKeySet as their first argument in order to plumb
+  // keys through the dispatcher. We don't want to expose the DispatchKeySet
+  // type to jit, so this argument is not part of the stack.
+  // See Note [Plumbing Keys Through The Dispatcher] for the background.
+  using ArgTypes = typename c10::remove_DispatchKeySet_arg_from_func<
+      Functor>::parameter_types;
+  using StackIndices =
+      std::make_index_sequence<guts::typelist::size<ArgTypes>::value>;
+  // Null pointer used purely as a type tag so the callee can deduce ArgTypes.
+  ArgTypes* const arg_types_tag = nullptr;
+  return call_functor_with_args_from_stack_<Functor, AllowDeprecatedTypes>(
+      functor, dispatchKeySet, stack, StackIndices(), arg_types_tag);
+}
+// push_outputs
+// Boxes a single kernel output and pushes it onto the stack.
+template <class OutputType, bool AllowDeprecatedTypes>
+struct push_outputs final {
+  // Contrary to [Note: Argument forwarding in the dispatcher], OutputType&& is
+  // used here to save one extra call to the move constructor. It is still not
+  // a universal reference, because OutputType is an explicitly specified
+  // class template parameter.
+  static void call(OutputType&& output, Stack* stack) {
+    using Boxer = return_to_ivalue<OutputType, AllowDeprecatedTypes>;
+    torch::jit::push(*stack, Boxer::call(std::forward<OutputType>(output)));
+  }
+  // Same as call(), but boxes through return_to_ivalue::copy from a const
+  // reference.
+  static void copy(const OutputType& output, Stack* stack) {
+    using Boxer = return_to_ivalue<OutputType, AllowDeprecatedTypes>;
+    torch::jit::push(*stack, Boxer::copy(output));
+  }
+};
+// Tuple outputs: every tuple element is boxed separately and all of them are
+// handed to a single torch::jit::push call, in tuple order.
+template <class... OutputTypes, bool AllowDeprecatedTypes>
+struct push_outputs<std::tuple<OutputTypes...>, AllowDeprecatedTypes> final {
+  using TupleType = std::tuple<OutputTypes...>;
+  using Indices = std::make_index_sequence<sizeof...(OutputTypes)>;
+  // Forwards each element of the tuple into its IValue.
+  static void call(std::tuple<OutputTypes...>&& output, Stack* stack) {
+    call_(std::move(output), stack, Indices());
+  }
+  // Boxes each element through return_to_ivalue::copy.
+  static void copy(const std::tuple<OutputTypes...>& output, Stack* stack) {
+    copy_(output, stack, Indices());
+  }
+ private:
+  // Expands over the element indices, forwarding element I as OutputTypes[I].
+  template <size_t... indices>
+  static void call_(
+      TupleType&& output,
+      Stack* stack,
+      std::index_sequence<indices...> /*unused*/) {
+    torch::jit::push(
+        *stack,
+        return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::call(
+            std::forward<OutputTypes>(std::get<indices>(output)))...);
+  }
+  // Expands over the element indices, passing element I by const reference.
+  template <size_t... indices>
+  static void copy_(
+      const TupleType& output,
+      Stack* stack,
+      std::index_sequence<indices...> /*unused*/) {
+    torch::jit::push(
+        *stack,
+        return_to_ivalue<OutputTypes, AllowDeprecatedTypes>::copy(
+            std::get<indices>(output))...);
+  }
+};
+// void "outputs": there is nothing to box, so both functions are no-ops. The
+// int parameter is a placeholder for the missing output value.
+template <bool AllowDeprecatedTypes>
+struct push_outputs<void, AllowDeprecatedTypes> final {
+  static void call(int /*dummy*/, Stack* /*stack*/) {
+  }
+  static void copy(int /*dummy*/, Stack* /*stack*/) {
+  }
+};
+// make_boxed_from_unboxed_functor
+// Provides the boxed entry point for an unboxed KernelFunctor: `call` unboxes
+// the arguments from the stack, invokes the functor, drops the inputs from
+// the stack and pushes the boxed result, if there is one.
+template <class KernelFunctor, bool AllowDeprecatedTypes>
+struct make_boxed_from_unboxed_functor final {
+  static_assert(
+      std::is_base_of_v<OperatorKernel, KernelFunctor>,
+      "Tried to register a kernel functor using the kernel<Functor>() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+  static void call(
+      OperatorKernel* functor,
+      const OperatorHandle& /*unused*/,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) {
+    using ReturnType =
+        typename guts::infer_function_traits_t<KernelFunctor>::return_type;
+    // DispatchKeySet is explicitly filtered out of the argument list. Some
+    // kernels take a DispatchKeySet as their first argument in order to plumb
+    // keys through the dispatcher. We don't want to expose the DispatchKeySet
+    // type to jit, so this argument is not part of the stack. See Note
+    // [Plumbing Keys Through The Dispatcher] for the background.
+    using ArgTypes = typename c10::remove_DispatchKeySet_arg_from_func<
+        KernelFunctor>::parameter_types;
+    constexpr size_t num_inputs = guts::typelist::size<ArgTypes>::value;
+    if constexpr (std::is_same_v<void, ReturnType>) {
+      // Nothing is returned: run the kernel, then remove its inputs.
+      call_functor_with_args_from_stack<KernelFunctor, AllowDeprecatedTypes>(
+          functor, dispatchKeySet, stack);
+      torch::jit::drop(*stack, num_inputs);
+    } else {
+      // ReturnType is decayed to ReturnType_ so that a returned reference is
+      // stored by value and cannot dangle. This is only required because some
+      // kernels still return `Tensor&`. [Note: VC++ and 'std': ambiguous
+      // symbol]
+      using ReturnType_ = ::std::decay_t<ReturnType>;
+      ReturnType_ output = call_functor_with_args_from_stack<
+          KernelFunctor,
+          AllowDeprecatedTypes>(functor, dispatchKeySet, stack);
+      // The inputs are removed before the output is pushed.
+      torch::jit::drop(*stack, num_inputs);
+      // See note [ VC++ and 'std': ambiguous symbol]
+      push_outputs<ReturnType_, AllowDeprecatedTypes>::call(
+          ::std::move(output), stack);
+    }
+  }
+};
+} // namespace impl
+} // namespace c10
+namespace torch {
+// Makes c10::OperatorKernel available under the name torch::OperatorKernel.
+using OperatorKernel = c10::OperatorKernel;
+} // namespace torch
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/boxing/impl/test_helpers.h ADDED Viewed

	@@ -0,0 +1,145 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <ATen/core/ivalue.h>
+#include <c10/core/CPUAllocator.h>
+#include <c10/util/irange.h>
+// Builds a vector of IValues from the given inputs, forwarding each one into
+// the list-initialization of the vector.
+template <class... Inputs>
+inline std::vector<c10::IValue> makeStack(Inputs&&... inputs) {
+  std::vector<c10::IValue> stack = {std::forward<Inputs>(inputs)...};
+  return stack;
+}
+// Creates a one-element float tensor whose storage comes from the CPU
+// allocator and whose TensorImpl is constructed with the dispatch key set
+// `ks`. Unless `requires_grad` is true, the autograd key is removed again.
+inline at::Tensor dummyTensor(
+    c10::DispatchKeySet ks,
+    bool requires_grad = false) {
+  auto* cpu_allocator = c10::GetCPUAllocator();
+  const auto float_meta = caffe2::TypeMeta::Make<float>();
+  const int64_t element_count = 1;
+  const int64_t byte_count = element_count * float_meta.itemsize();
+  auto storage = c10::make_intrusive<c10::StorageImpl>(
+      c10::StorageImpl::use_byte_size_t(),
+      byte_count,
+      cpu_allocator->allocate(byte_count),
+      cpu_allocator,
+      /*resizable=*/true);
+  at::Tensor tensor =
+      at::detail::make_tensor<c10::TensorImpl>(storage, ks, float_meta);
+  // TODO: This simulates the ideal case where a Tensor only carries Autograd
+  // backend keys when it requires grad. Currently the TensorImpl constructor
+  // adds Autograd keys by default.
+  if (requires_grad) {
+    return tensor;
+  }
+  tensor.unsafeGetTensorImpl()->remove_autograd_key();
+  return tensor;
+}
+// Convenience overload: wraps a single dispatch key in a DispatchKeySet and
+// forwards to the DispatchKeySet overload of dummyTensor.
+inline at::Tensor dummyTensor(
+    c10::DispatchKey dispatch_key,
+    bool requires_grad = false) {
+  const c10::DispatchKeySet single_key_set(dispatch_key);
+  return dummyTensor(single_key_set, requires_grad);
+}
+// Calls `op` through the boxed calling convention: the arguments are packed
+// into a stack with makeStack, the operator is invoked on that stack, and the
+// stack as left behind by the call is returned.
+template <class... Args>
+inline std::vector<c10::IValue> callOp(
+    const c10::OperatorHandle& op,
+    Args... args) {
+  std::vector<c10::IValue> boxed_stack =
+      makeStack(std::forward<Args>(args)...);
+  op.callBoxed(&boxed_stack);
+  return boxed_stack;
+}
+// Calls `op` through the unboxed calling convention using the signature
+// Result(Args...) and returns whatever the typed call returns.
+template <class Result, class... Args>
+inline Result callOpUnboxed(const c10::OperatorHandle& op, Args... args) {
+  auto typed_op = op.typed<Result(Args...)>();
+  return typed_op.call(std::forward<Args>(args)...);
+}
+// Like callOpUnboxed, but passes `dispatchKey` to the typed handle's
+// callWithDispatchKey instead of using plain call().
+template <class Result, class... Args>
+inline Result callOpUnboxedWithDispatchKey(
+    const c10::OperatorHandle& op,
+    c10::DispatchKey dispatchKey,
+    Args... args) {
+  auto typed_op = op.typed<Result(Args...)>();
+  return typed_op.callWithDispatchKey(
+      dispatchKey, std::forward<Args>(args)...);
+}
+// Like callOpUnboxed, but hands the caller-supplied dispatch key set `ks` to
+// the typed handle's redispatch().
+template <class Result, class... Args>
+inline Result callOpUnboxedWithPrecomputedDispatchKeySet(
+    const c10::OperatorHandle& op,
+    c10::DispatchKeySet ks,
+    Args... args) {
+  auto typed_op = op.typed<Result(Args...)>();
+  return typed_op.redispatch(ks, std::forward<Args>(args)...);
+}
+// Expects that calling the operator `op_name` (empty overload name) with a
+// dummy tensor for `dispatch_key` and the integer 5 throws, i.e. that no
+// usable kernel is found for that call.
+//
+// The operator schema itself must be registered: findSchema returns an
+// optional, and dereferencing an empty optional is undefined behavior. We
+// therefore fail fatally (returning from this helper) when the schema is
+// missing instead of dereferencing it.
+inline void expectDoesntFindKernel(
+    const char* op_name,
+    c10::DispatchKey dispatch_key) {
+  auto op = c10::Dispatcher::singleton().findSchema({op_name, ""});
+  ASSERT_TRUE(op.has_value())
+      << "operator " << op_name << " has no registered schema";
+  EXPECT_ANY_THROW(callOp(*op, dummyTensor(dispatch_key), 5));
+}
+// Expects that no schema is registered for the operator `op_name` with an
+// empty overload name.
+inline void expectDoesntFindOperator(const char* op_name) {
+  const c10::OperatorName lookup_name(op_name, "");
+  auto op = c10::Dispatcher::singleton().findSchema(lookup_name);
+  EXPECT_FALSE(op.has_value());
+}
+// Invokes `functor` and expects it to throw an `Exception` whose what()
+// contains `expectMessageContains`. A failure is recorded when the functor
+// returns without throwing; exceptions of other types are not caught here.
+template <class Exception, class Functor>
+inline void expectThrows(Functor&& functor, const char* expectMessageContains) {
+  bool caught_expected = false;
+  try {
+    std::forward<Functor>(functor)();
+  } catch (const Exception& e) {
+    caught_expected = true;
+    EXPECT_THAT(e.what(), testing::HasSubstr(expectMessageContains));
+  }
+  if (!caught_expected) {
+    ADD_FAILURE() << "Expected to throw exception containing \""
+                  << expectMessageContains << "\" but didn't throw";
+  }
+}
+// Checks that the std::array `actual` has the same length and the same
+// elements (compared pairwise with EXPECT_EQ) as `expected`.
+//
+// The length check is fatal (ASSERT_EQ): on a mismatch we return before the
+// element loop, which would otherwise index `actual` out of bounds whenever
+// `expected` is the longer sequence.
+template <class T, size_t N>
+void expectListEquals(c10::ArrayRef<T> expected, std::array<T, N> actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual[i]);
+  }
+}
+// Checks that the ArrayRef `actual` has the same length and the same
+// elements (compared pairwise with EXPECT_EQ) as `expected`.
+//
+// The length check is fatal (ASSERT_EQ): on a mismatch we return before the
+// element loop, which would otherwise index `actual` out of bounds whenever
+// `expected` is the longer sequence.
+template <class T>
+void expectListEquals(c10::ArrayRef<T> expected, c10::ArrayRef<T> actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual[i]);
+  }
+}
+// Checks that the c10::List `actual` has the same length and the same
+// elements (compared pairwise with EXPECT_EQ) as `expected`.
+//
+// The length check is fatal (ASSERT_EQ): on a mismatch we return before the
+// element loop, which would otherwise call actual.get(i) with an index past
+// the end of `actual` whenever `expected` is the longer sequence.
+template <class T>
+void expectListEquals(c10::ArrayRef<T> expected, c10::List<T> actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual.get(i));
+  }
+}
+// Checks that the std::vector `actual` has the same length and the same
+// elements (compared pairwise with EXPECT_EQ) as `expected`.
+//
+// The length check is fatal (ASSERT_EQ): on a mismatch we return before the
+// element loop, which would otherwise index `actual` out of bounds whenever
+// `expected` is the longer sequence.
+template <class T>
+void expectListEquals(c10::ArrayRef<T> expected, std::vector<T> actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  for (const auto i : c10::irange(expected.size())) {
+    EXPECT_EQ(expected[i], actual[i]);
+  }
+}
+// Returns the dispatch key obtained by passing the tensor's key set to
+// legacyExtractDispatchKey.
+// NB: Strictly speaking this is not sound, but every type set constructed
+// in these helpers is a singleton, so it works out.
+static inline c10::DispatchKey extractDispatchKey(const at::Tensor& t) {
+  const c10::DispatchKeySet tensor_keys = t.key_set();
+  return legacyExtractDispatchKey(tensor_keys);
+}
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/CppSignature.h ADDED Viewed

	@@ -0,0 +1,72 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/DispatchKeySet.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/Type.h>
+#include <cstring>
+#include <typeindex>
+namespace c10::impl {
+// A CppSignature object holds RTTI information about a C++ function signature
+// at runtime and can compare them or get a debug-printable name.
+class TORCH_API CppSignature final {
+ public:
+  CppSignature(const CppSignature&) = default;
+  CppSignature(CppSignature&&) noexcept = default;
+  CppSignature& operator=(const CppSignature&) = default;
+  CppSignature& operator=(CppSignature&&) noexcept = default;
+  // Builds the CppSignature for the function type `FuncType`.
+  template <class FuncType>
+  static CppSignature make() {
+    // Normalize functors, lambdas, function pointers, etc. into the plain
+    // function type. The first argument of the schema might be of type
+    // DispatchKeySet, in which case we remove it. We do this to guarantee that
+    // all CppSignature's for an operator will match, even if they're registered
+    // with different calling conventions.
+    // See Note [Plumbing Keys Through The Dispatcher]
+    using decayed_function_type =
+        typename c10::remove_DispatchKeySet_arg_from_func<
+            std::decay_t<FuncType>>::func_type;
+    return CppSignature(std::type_index(typeid(decayed_function_type)));
+  }
+  // Returns the stored type name after passing it through c10::demangle.
+  std::string name() const {
+    return c10::demangle(signature_.name());
+  }
+  // Two signatures are equal when their type_index values compare equal or,
+  // failing that, when their RTTI type names are the same string.
+  friend bool operator==(const CppSignature& lhs, const CppSignature& rhs) {
+    // Without RTLD_GLOBAL, the type_index comparison could yield false because
+    // they point to different instances of the RTTI data, but the types would
+    // still be the same, so the names are compared as a fallback.
+    // Note that there still is a case where this might not work, i.e. when
+    // linking libraries of different compilers together, they might have
+    // different ways to serialize a type name. That, together with a missing
+    // RTLD_GLOBAL, would still fail this.
+    return lhs.signature_ == rhs.signature_ ||
+        std::strcmp(lhs.signature_.name(), rhs.signature_.name()) == 0;
+  }
+ private:
+  // Only make() creates instances, so the type_index always comes from typeid.
+  explicit CppSignature(std::type_index signature)
+      : signature_(std::move(signature)) {}
+  std::type_index signature_;
+};
+// Inequality is the negation of CppSignature's operator==.
+inline bool operator!=(const CppSignature& lhs, const CppSignature& rhs) {
+  const bool same_signature = (lhs == rhs);
+  return !same_signature;
+}
+} // namespace c10::impl
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/DispatchKeyExtractor.h ADDED Viewed

	@@ -0,0 +1,285 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/Variadic.h>
+#include <ATen/core/function_schema.h>
+#include <ATen/core/jit_type.h>
+#include <ATen/core/stack.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/util/Bitset.h>
+#include <c10/util/irange.h>
+#include <cstdint>
+namespace c10 {
+namespace impl {
+// Given the DispatchKeySet of a Tensor, work out the key set that should
+// actually be used for dispatch, taking the thread-local (TLS) included and
+// excluded sets into account and skipping backends which fall through.
+//
+// Unlike Tensor::key_set(), the result for a given tensor can change
+// depending on TLS.
+//
+// NB: If there is no valid dispatch key, this will return Undefined
+inline DispatchKeySet computeDispatchKeySet(
+    DispatchKeySet ks,
+    // The key mask eliminates (via zero entries) keys which must not be
+    // considered for dispatch.  It is used in two situations:
+    //
+    // - If an operator's dispatch table contains a fallthrough entry, that
+    //   entry should be bypassed entirely when finding the key
+    // - If a user invokes with redispatch, the mask zeroes out the key the
+    //   user asked us to stop at.
+    //
+    // These excluded backends are NOT tracked in the TLS, but must be applied
+    // AFTER TLS (since the backend may have been introduced for consideration
+    // by the included TLS), which is why they are passed in to this function
+    // (as opposed to just being applied to the input 'ks').
+    DispatchKeySet key_mask) {
+  const c10::impl::LocalDispatchKeySet tls =
+      c10::impl::tls_local_dispatch_key_set();
+  // TODO: It's a bit irritating that we have to do logical ORs here, it would
+  // be nice to only do one.  Can always_included be folded into the TLS?  Well,
+  // it's a bit troublesome, because fastpath TLS access requires the type of
+  // the TLS in question to be zero-initialized, so you don't actually win
+  // anything in that case.
+  const DispatchKeySet with_included = ks | tls.included_;
+  const DispatchKeySet without_excluded = with_included - tls.excluded_;
+  return without_excluded & key_mask;
+}
+} // namespace impl
+namespace detail {
+// Small helper that accumulates, into `ts`, the DispatchKeySet of every
+// argument type known to carry one.  Used to extract dispatch keys from
+// unboxed calls.
+struct MultiDispatchKeySet : at::IterArgs<MultiDispatchKeySet> {
+  // Union of the key sets of all arguments visited so far.
+  DispatchKeySet ts;
+  void operator()(const at::Tensor& x) {
+    ts = ts | x.key_set();
+  }
+  // An empty optional contributes nothing.
+  void operator()(const std::optional<at::Tensor>& x) {
+    if (!x.has_value()) {
+      return;
+    }
+    ts = ts | x->key_set();
+  }
+  void operator()(at::ArrayRef<at::Tensor> xs) {
+    for (const auto& tensor : xs) {
+      ts = ts | tensor.key_set();
+    }
+  }
+  // Tensor?[] translates to this case; empty entries are skipped.
+  void operator()(const c10::List<std::optional<at::Tensor>>& xs) {
+    for (std::optional<at::Tensor> maybe_tensor : xs) {
+      if (!maybe_tensor.has_value()) {
+        continue;
+      }
+      ts = ts | maybe_tensor->key_set();
+    }
+  }
+  // Structured Tensor[] translates to this case
+  void operator()(const at::ITensorListRef& xs) {
+    for (const auto& tensor : xs) {
+      ts = ts | tensor.key_set();
+    }
+  }
+  [[noreturn]] void operator()(
+      at::ArrayRef<std::optional<at::Tensor>> /*unused*/) {
+    // Guard that the handling of Tensor?[] didn't change: this overload is
+    // never expected to be selected.
+    TORCH_INTERNAL_ASSERT(false);
+  }
+  // Only a defined generator contributes its key set.
+  void operator()(const at::Generator& gen) {
+    if (!gen.defined()) {
+      return;
+    }
+    ts = ts | gen.key_set();
+  }
+  void operator()(const std::optional<at::Generator>& gen) {
+    if (!gen.has_value() || !gen->defined()) {
+      return;
+    }
+    ts = ts | gen->key_set();
+  }
+  // Every other argument type is ignored.
+  template <typename T>
+  void operator()(const T& /*unused*/) {
+    // do nothing
+  }
+};
+// Returns the union of the dispatch key sets collected from `args` by
+// MultiDispatchKeySet.
+// NB: the arguments are taken by const reference on purpose (no universal
+// forwarding here!) because nothing must be moved into this function.
+template <typename... Args>
+DispatchKeySet multi_dispatch_key_set(const Args&... args) {
+  auto collector = MultiDispatchKeySet();
+  collector.apply(args...);
+  return collector.ts;
+}
+} // namespace detail
+/**
+ * An instance of DispatchKeyExtractor knows how to get a dispatch key given
+ * a list of arguments for an operator call.
+ *
+ * The instance is specific to a certain operator because:
+ *  - In boxed dispatch, different operators have different ways to extract
+ *    the dispatch key (e.g. different numbers of arguments), and we precompute
+ *    the stack locations we should look at; and
+ *  - In all dispatch, some backends should be excluded from dispatch because
+ *    they have been registered as fallthrough.  The set of excluded backends
+ *    varies from operator to operator, as some operators may have overridden
+ *    the fallthrough with custom behavior.
+ *
+ *   Note - this should maintain identical impl to the py dispatcher key
+ * extraction logic at pytorch/torch/dispatcher.py
+ */
+struct TORCH_API DispatchKeyExtractor final {
+ public:
+  // Creates an extractor whose dispatch-argument bitset is derived from
+  // `schema`.
+  static DispatchKeyExtractor make(const FunctionSchema& schema) {
+    return DispatchKeyExtractor(makeBitsetForDispatchArgs(schema));
+  }
+  // Creates an extractor with an empty dispatch-argument bitset.
+  static DispatchKeyExtractor makeUninitialized() {
+    return DispatchKeyExtractor(c10::utils::bitset());
+  }
+  // Installs the bitset for `schema`; asserts that no bits were set before.
+  void registerSchema(const FunctionSchema& schema) {
+    TORCH_INTERNAL_ASSERT(dispatch_arg_indices_reverse_.is_entirely_unset());
+    dispatch_arg_indices_reverse_ = makeBitsetForDispatchArgs(schema);
+  }
+  // Resets the dispatch-argument bitset to the empty bitset.
+  void deregisterSchema() {
+    dispatch_arg_indices_reverse_ = c10::utils::bitset();
+  }
+  // Computes the dispatch key set for a boxed call by inspecting only those
+  // stack slots whose bit is set in dispatch_arg_indices_reverse_.
+  DispatchKeySet getDispatchKeySetBoxed(const torch::jit::Stack* stack) const {
+    DispatchKeySet ks;
+    auto collect_keys = [&](size_t reverse_arg_index) {
+      const auto& ivalue = torch::jit::peek(*stack, 0, reverse_arg_index + 1);
+      if (C10_LIKELY(ivalue.isTensor())) {
+        // NB: Take care not to introduce a refcount bump (there's
+        // no safe toTensorRef method, alas)
+        ks = ks | ivalue.unsafeToTensorImpl()->key_set();
+        return;
+      }
+      if (C10_UNLIKELY(ivalue.isTensorList())) {
+        // NB: use toListRef as it doesn't induce refcount bumps
+        // (toTensorListRef is not a thing)
+        for (const auto& nv : ivalue.toListRef()) {
+          auto* tensor_impl = nv.unsafeToTensorImpl();
+          ks = ks | tensor_impl->key_set();
+        }
+        return;
+      }
+      // Tensor?[] translates to a c10::List<IValue> so we need to peek inside
+      if (C10_UNLIKELY(ivalue.isList())) {
+        for (const auto& elt : ivalue.toListRef()) {
+          if (!elt.isTensor()) {
+            continue;
+          }
+          ks = ks | elt.toTensor().key_set();
+        }
+      }
+    };
+    dispatch_arg_indices_reverse_.for_each_set_bit(collect_keys);
+    // Keys that are fallthrough should be skipped
+    if (!requiresBitsetPerBackend_) {
+      return impl::computeDispatchKeySet(ks, nonFallthroughKeys_);
+    }
+    c10::impl::LocalDispatchKeySet tls =
+        c10::impl::tls_local_dispatch_key_set();
+    auto backend_idx = ((ks | tls.included_) - tls.excluded_).getBackendIndex();
+    return impl::computeDispatchKeySet(
+        ks, nonFallthroughKeysPerBackend_[backend_idx]);
+  }
+  // Computes the dispatch key set for an unboxed call from the key sets
+  // collected over `args` by detail::multi_dispatch_key_set.
+  template <class... Args>
+  DispatchKeySet getDispatchKeySetUnboxed(const Args&... args) const {
+    auto ks = detail::multi_dispatch_key_set(args...);
+    // Keys that are fallthrough should be skipped
+    if (!requiresBitsetPerBackend_) {
+      return impl::computeDispatchKeySet(ks, nonFallthroughKeys_);
+    }
+    c10::impl::LocalDispatchKeySet tls =
+        c10::impl::tls_local_dispatch_key_set();
+    auto backend_idx = ((ks | tls.included_) - tls.excluded_).getBackendIndex();
+    return impl::computeDispatchKeySet(
+        ks, nonFallthroughKeysPerBackend_[backend_idx]);
+  }
+  void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough);
+  std::string dumpState() const;
+  void checkInvariants(const FunctionSchema& schema) const;
+ private:
+  // True when `type` is a subtype of Tensor, Tensor[], Tensor?[] or Tensor?,
+  // i.e. one of the argument types that is looked at for dispatch.
+  static bool isDispatchType(const Type& type) {
+    // Checking isSubtypeOf on a DynamicType heap-allocates a
+    // DynamicType version of the argument if it's not a DynamicType
+    // already, and this has measurable overhead during startup.
+#ifndef C10_MOBILE
+    return type.isSubtypeOf(*TensorType::get()) ||
+        type.isSubtypeOf(*ListType::ofTensors()) ||
+        type.isSubtypeOf(*ListType::ofOptionalTensors()) ||
+        type.isSubtypeOf(*OptionalType::ofTensor());
+#else // C10_MOBILE
+    struct CachedTypes {
+      DynamicTypePtr listOfTensors;
+      DynamicTypePtr listOfOptionalTensors;
+      DynamicTypePtr optionalOfTensor;
+    };
+    static const CachedTypes ct = {
+        DynamicType::create(*ListType::ofTensors()),
+        DynamicType::create(*ListType::ofOptionalTensors()),
+        DynamicType::create(*OptionalType::ofTensor())};
+    return type.isSubtypeOf(c10::TypeFactory::get<TensorType>()) ||
+        type.isSubtypeOf(ct.listOfTensors) ||
+        type.isSubtypeOf(ct.listOfOptionalTensors) ||
+        type.isSubtypeOf(ct.optionalOfTensor);
+#endif // C10_MOBILE
+  }
+  // Builds the reversed bitset of dispatch-relevant arguments for `schema`:
+  // the bit for argument `index` is stored at position
+  // (number of arguments - 1 - index).
+  static c10::utils::bitset makeBitsetForDispatchArgs(
+      const FunctionSchema& schema) {
+    TORCH_CHECK(
+        schema.arguments().size() <= c10::utils::bitset::NUM_BITS(),
+        "The function schema has ",
+        schema.arguments().size(),
+        " arguments but this PyTorch build only supports ",
+        c10::utils::bitset::NUM_BITS());
+    const auto& arguments = schema.arguments();
+    const size_t num_arguments = arguments.size();
+    c10::utils::bitset dispatch_arg_indices_reverse;
+    for (size_t index = 0; index < num_arguments; ++index) {
+      if (!isDispatchType(*arguments[index].type())) {
+        continue;
+      }
+      dispatch_arg_indices_reverse.set(num_arguments - 1 - index);
+    }
+    return dispatch_arg_indices_reverse;
+  }
+  // Starts out with every key treated as non-fallthrough, both in the shared
+  // set and in each per-backend set.
+  explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse)
+      : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse),
+        nonFallthroughKeys_(DispatchKeySet::FULL) {
+    for (auto& backend_keys : nonFallthroughKeysPerBackend_) {
+      backend_keys = DispatchKeySet::FULL;
+    }
+  }
+  // this is a bitset that has ones for each argument index which has to be
+  // considered for dispatch. This avoids having to iterate over the stack
+  // to find all the tensors. The bits are stored in reverse order, i.e.
+  // if dispatch_arg_indices_reverse_[i] == true, then the i-th argument from
+  // the top of the stack (i.e. the i-th last argument of the function)
+  // is relevant for dispatch.
+  // dispatch_arg_indices_reverse_ is allowed to have zero bits set; that just
+  // means you must do the fallthrough
+  c10::utils::bitset dispatch_arg_indices_reverse_;
+  // Set of functionality keys for which the operator does NOT have fallthrough
+  // kernel.
+  DispatchKeySet nonFallthroughKeys_;
+  // Set of functionality keys for which the operator does NOT have fallthrough
+  // kernel, defined PER BACKEND. This is only needed if we know that the
+  // operator has a different set of fallthroughs defined for some backends.
+  std::array<DispatchKeySet, num_backends> nonFallthroughKeysPerBackend_;
+  // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast
+  // path), or if we need to fall back to the slower path and check
+  // nonFallthroughKeysPerBackend_
+  bool requiresBitsetPerBackend_{false};
+};
+} // namespace c10
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/Dispatcher.h ADDED Viewed

	@@ -0,0 +1,955 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/SequenceNumber.h>
+#include <ATen/core/boxing/KernelFunction.h>
+#include <ATen/core/boxing/impl/boxing.h>
+#include <ATen/core/dispatch/CppSignature.h>
+#include <ATen/core/dispatch/OperatorEntry.h>
+#include <ATen/core/dispatch/RegistrationHandleRAII.h>
+#include <ATen/record_function.h>
+#include <c10/core/SafePyObject.h>
+#include <c10/util/Exception.h>
+#include <c10/util/LeftRight.h>
+#include <condition_variable>
+#include <list>
+#include <mutex>
+#include <type_traits>
+#include <ATen/core/enum_tag.h>
+#include <ATen/core/grad_mode.h>
+#ifndef NDEBUG
+#include <iostream>
+#endif
+namespace c10 {
+TORCH_API bool show_dispatch_trace();
+TORCH_API void dispatch_trace_nesting_incr();
+TORCH_API void dispatch_trace_nesting_decr();
+TORCH_API int64_t dispatch_trace_nesting_value();
+// Scope guard for the dispatch trace nesting level: the constructor calls
+// dispatch_trace_nesting_incr() and the destructor calls
+// dispatch_trace_nesting_decr().
+//
+// Copying is disabled because a copy would run the destructor (and thus the
+// decrement) an additional time without a matching increment.
+struct DispatchTraceNestingGuard {
+  DispatchTraceNestingGuard() {
+    dispatch_trace_nesting_incr();
+  }
+  DispatchTraceNestingGuard(const DispatchTraceNestingGuard&) = delete;
+  DispatchTraceNestingGuard& operator=(const DispatchTraceNestingGuard&) =
+      delete;
+  ~DispatchTraceNestingGuard() {
+    dispatch_trace_nesting_decr();
+  }
+};
+class TORCH_API OperatorHandle;
+template <class FuncType>
+class TypedOperatorHandle;
+/**
+ * Implement this interface and register your instance with the dispatcher
+ * to get notified when operators are registered or deregistered with
+ * the dispatcher.
+ *
+ * NB: registration events only occur when a 'def' occurs; we don't trigger
+ * on 'impl' or 'fallback' calls.
+ *
+ * Both callbacks are pure virtual, so a concrete listener has to override
+ * both of them; each one receives a handle to the operator concerned.
+ */
+class TORCH_API OpRegistrationListener {
+ public:
+  virtual ~OpRegistrationListener(); // declared here, defined out of line
+  virtual void onOperatorRegistered(const OperatorHandle& op) = 0; // notification for a registered operator
+  virtual void onOperatorDeregistered(const OperatorHandle& op) = 0; // notification for a deregistered operator
+};
+namespace detail {
+class RegistrationListenerList;
+}
+class SchemaRegistrationHandleRAII;
+/**
+ * Top-level dispatch interface for dispatching via the dynamic dispatcher.
+ * Most end users shouldn't use this directly; if you're trying to register
+ * ops look in op_registration
+ */
+class TORCH_API Dispatcher final {
+ private:
+  // For direct access to backend fallback information
+  friend class impl::OperatorEntry;
+  // Per-operator record: the OperatorEntry plus counters of outstanding
+  // registrations for it.
+  struct OperatorDef final {
+    explicit OperatorDef(OperatorName&& op_name) : op{std::move(op_name)} {}
+    impl::OperatorEntry op;
+    // Both counters track outstanding RegistrationHandleRAII objects for
+    // this operator.
+    //
+    // def_count covers def() registrations only.  In the new world it
+    // should only ever be 1, but old style registrations may register the
+    // schema multiple times, which increases this count.
+    //
+    // def_and_impl_count covers def() and impl() registrations combined.
+    //
+    // When the last def() gets unregistered, the Deregistered listeners
+    // must be called immediately, but the handle must not actually be
+    // deleted: other outstanding RAII destructors will still run and they
+    // need a working operator handle in that case.
+    size_t def_count{0};
+    size_t def_and_impl_count{0};
+  };
+  friend class OperatorHandle;
+  template <class>
+  friend class TypedOperatorHandle;
+  // Bundles an atomic `alive` flag, which starts out true, with a mutex.
+  struct Guard final {
+    Guard() {}
+    std::atomic<bool> alive{true};
+    std::mutex mutex;
+  };
+ public:
+  ~Dispatcher();
+  // Implementation note: this class abstracts over the fact that we have
+  // per-operator dispatch tables.  This could be easily adjusted to have a
+  // single global hash table.
+  static Dispatcher& realSingleton();
+  // Returns the process-wide Dispatcher obtained from realSingleton().
+  C10_ALWAYS_INLINE static Dispatcher& singleton() {
+#ifdef C10_MOBILE
+    // On C10_MOBILE we never inline a static function that has a static
+    // member: the generated code calls __cxa_guard_acquire and
+    // __cxa_guard_release to implement exactly-once initialization of the
+    // function-local static used in the non-mobile branch below. That
+    // additional code, when duplicated across all operator stubs for every
+    // backend, results in a lot of additional code being generated by the
+    // compiler.
+    return realSingleton();
+#else
+    // Implemented inline so that steady-state code needn't incur
+    // function-call overhead. `realSingleton` itself can't simply be
+    // inlined, because its function-local static would get duplicated
+    // across all DSOs that include & use this header, leading to multiple
+    // singleton instances. Here we only keep a reference to its result.
+    static Dispatcher& s = realSingleton();
+    return s;
+#endif
+  }
+  // ------------------------------------------------------------------------
+  //
+  // Accessing operators by schema
+  //
+  // ------------------------------------------------------------------------
+  /**
+   * Looks for an operator schema with the given name and overload name
+   * and returns it if it is registered WITH A SCHEMA.
+   * Returns nullopt otherwise.
+   */
+  std::optional<OperatorHandle> findSchema(const OperatorName& operator_name);
+  /**
+   * Variant of findSchema that results in less code generated at the call site.
+   * It (1) takes const char* pointer rather than OperatorName (so we skip
+   * generating std::string constructor calls at the call site), and (2)
+   * it raises an exception if the operator is not found (so we skip
+   * generating exception raising code at the call site)
+   *
+   * Irritatingly, we still have to generate the handful of instructions
+   * for dealing with an exception being thrown during static initialization
+   * (e.g. __cxa_guard_abort).  If we could annotate this method noexcept we
+   * could avoid this code too, but as the name of the function suggests,
+   * it does throw exceptions.
+   */
+  OperatorHandle findSchemaOrThrow(const char* name, const char* overload_name);
+  // Like findSchema, but also returns OperatorHandle even if there is no schema
+  std::optional<OperatorHandle> findOp(const OperatorName& operator_name);
+  // Returns a list of all operator names present in the operatorLookupTable_
+  const std::vector<OperatorName> getAllOpNames();
+  // Returns a list of all operator names present in the operatorLookupTable_
+  // for a given dispatch key
+  const std::vector<OperatorName> getAllOpNamesForDispatchKey(DispatchKey k);
+  // ------------------------------------------------------------------------
+  //
+  // Invoking operators
+  //
+  // ------------------------------------------------------------------------
+  template <class Return, class... Args>
+  Return call(const TypedOperatorHandle<Return(Args...)>& op, Args... args)
+      const;
+  template <class Return, class... Args>
+  static Return callWithDispatchKeySlowPath(
+      const TypedOperatorHandle<Return(Args...)>& op,
+      at::StepCallbacks& stepCallbacks,
+      DispatchKeySet dispatchKeySet,
+      const KernelFunction& kernel,
+      Args... args);
+  // Like call, but intended for use in a redispatch in kernels that have
+  // explicitly performed the DispatchKey update calculation. This will take
+  // the DispatchKeySet completely as is and dispatch to the kernel of the
+  // corresponding highest priority key in the set. Note that this version of
+  // redispatch treats the inputted DispatchKeySet *as is*, and does NOT mask
+  // out the highest priority key. See Note [Plumbing Keys Through The
+  // Dispatcher]
+  template <class Return, class... Args>
+  Return redispatch(
+      const TypedOperatorHandle<Return(Args...)>& op,
+      DispatchKeySet currentDispatchKeySet,
+      Args... args) const;
+  // Invoke an operator via the boxed calling convention using an IValue stack
+  void callBoxed(const OperatorHandle& op, Stack* stack) const;
+  void callBoxedForDispatchKey(
+      const OperatorHandle& op,
+      DispatchKey dk,
+      Stack* stack) const;
+  // TODO: This will only be useful if we write a backend fallback that plumbs
+  // dispatch keys (currently there are none) See Note [Plumbing Keys Through
+  // The Dispatcher]
+  void redispatchBoxed(
+      const OperatorHandle& op,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) const;
+  // Returns true when `dk` maps to a non-negative dispatch table index and
+  // the backend fallback kernel stored at that index is valid.
+  bool hasBackendFallbackForDispatchKey(DispatchKey dk) {
+    const auto table_index = getDispatchTableIndexForDispatchKey(dk);
+    return table_index >= 0 &&
+        backendFallbackKernels_[table_index].kernel.isValid();
+  }
+  // Used by torchdeploy/multipy for multiple  // codespell:ignore: multipy
+  // interpreters racing.
+  void waitForDef(const FunctionSchema& schema);
+  void waitForImpl(
+      const OperatorName& op_name,
+      std::optional<DispatchKey> dispatch_key);
+  // ------------------------------------------------------------------------
+  //
+  // Performing registrations (NON user public; use op_registration)
+  //
+  // ------------------------------------------------------------------------
+  /**
+   * Register a new operator schema.
+   *
+   * If a schema with the same operator name and overload name already exists,
+   * this function will check that both schemas are exactly identical.
+   */
+  RegistrationHandleRAII registerDef(
+      FunctionSchema schema,
+      std::string debug,
+      std::vector<at::Tag> tags = {});
+  /**
+   * Register a kernel to the dispatch table for an operator.
+   * If dispatch_key is nullopt, then this registers a fallback kernel.
+   *
+   * @return A RAII object that manages the lifetime of the registration.
+   *         Once that object is destructed, the kernel will be deregistered.
+   */
+  // NB: steals the inferred function schema, as we may need to hold on to
+  // it for a bit until the real schema turns up
+  RegistrationHandleRAII registerImpl(
+      OperatorName op_name,
+      std::optional<DispatchKey> dispatch_key,
+      KernelFunction kernel,
+      std::optional<impl::CppSignature> cpp_signature,
+      std::unique_ptr<FunctionSchema> inferred_function_schema,
+      std::string debug);
+  /**
+   * Given an operator, tells the Dispatcher that we have implemented a fake
+   * impl for this op in the given Python module. Call this a "pystub".
+   */
+  RegistrationHandleRAII registerPythonModule(
+      const OperatorName& op_name,
+      const char* pymodule,
+      const char* context);
+  /**
+   * Given an operator, throws if we have a pystub.
+   */
+  void throwIfHasPythonModule(OperatorName op_name);
+  std::optional<std::pair<const char*, const char*>> getPyStub(
+      OperatorName op_name);
+  /**
+   * Register a new operator by name.
+   */
+  RegistrationHandleRAII registerName(OperatorName op_name);
+  /**
+   * Register a fallback kernel for a backend.
+   * If an operator is called but there is no concrete kernel for the dispatch
+   * key of the given operator arguments, it will check if there is such a
+   * fallback kernel for the given dispatch key and, if yes, call that one.
+   */
+  RegistrationHandleRAII registerFallback(
+      DispatchKey dispatch_key,
+      KernelFunction kernel,
+      std::string debug);
+  /**
+   * Use to register whenever we had a TORCH_LIBRARY declaration in the frontend
+   * API.  These invocations are only permitted once per program, so we raise
+   * an error if this is called again for the same namespace.
+   */
+  RegistrationHandleRAII registerLibrary(std::string ns, std::string debug);
+  // ------------------------------------------------------------------------
+  //
+  // Listeners on registrations
+  //
+  // ------------------------------------------------------------------------
+  /**
+   * Add a listener that gets called whenever a new op is registered or an
+   * existing op is deregistered. Immediately after registering, this listener
+   * gets called for all previously registered ops, so it can be used to keep
+   * track of ops registered with this dispatcher.
+   */
+  RegistrationHandleRAII addRegistrationListener(
+      std::unique_ptr<OpRegistrationListener> listener);
+  void checkInvariants() const;
+  //
+  // ------------------------------------------------------------------------
+  //
+  // Assertions
+  //
+  // ------------------------------------------------------------------------
+  /**
+   * For testing purposes.
+   * Returns a list of all operators that were created through calls to
+   * registerImpl(), without any corresponding calls to registerDef(). After
+   * static initialization is done this is almost certainly a bug, as the
+   * created OperatorHandle won't have any schema associated with it and users
+   * calling the op through the dispatcher won't be able to access it
+   *
+   * Note that we cannot enforce this invariant "as we go" during static
+   * initialization, due to undefined static initialization order- we have no
+   * guarantees over the order in which .def() and .impl() calls are registered
+   * in the dispatcher at static initialization time. So this function should
+   * only be called after static initialization.
+   */
+  std::vector<OperatorHandle> findDanglingImpls() const;
+  /**
+   * Useful for inspecting global Dispatcher registration state.
+   * Returns the names of all operators with a kernel registered for the
+   * specified DispatchKey. If no DispatchKey is specified, it returns all
+   * registered operators.
+   */
+  std::vector<OperatorName> getRegistrationsForDispatchKey(
+      std::optional<DispatchKey> k) const;
+ private:
+  Dispatcher();
+  static int64_t sequenceNumberForRunningRecordFunction(
+      DispatchKey dispatchKey,
+      DispatchKeySet dispatchKeySet);
+  static void runRecordFunction(
+      at::RecordFunction& guard,
+      at::RecordFunction::schema_ref_t schema_ref,
+      DispatchKey dispatchKey,
+      DispatchKeySet dispatchKeySet);
+  static void runRecordFunction(
+      at::RecordFunction& guard,
+      at::RecordFunction::schema_ref_t schema_ref,
+      DispatchKey dispatchKey,
+      DispatchKeySet dispatchKeySet,
+      c10::ArrayRef<const c10::IValue> args);
+#ifdef FBCODE_CAFFE2
+  static bool profilingOperatorEvents();
+  static void fireOpStartUSDT(
+      at::RecordFunction::schema_ref_t schema_ref,
+      std::vector<void*>& argsAddresses,
+      std::vector<const char*>& argsTypes);
+  static void fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref);
+#endif // FBCODE_CAFFE2
+  OperatorHandle findOrRegisterSchema_(FunctionSchema&& schema);
+  OperatorHandle findOrRegisterName_(const OperatorName& op_name);
+  void deregisterDef_(const OperatorHandle& op, const OperatorName& op_name);
+  void deregisterImpl_(
+      const OperatorHandle& op,
+      const OperatorName& op_name,
+      std::optional<DispatchKey> dispatch_key,
+      impl::OperatorEntry::AnnotatedKernelContainerIterator kernel_handle);
+  void deregisterName_(const OperatorHandle& op, const OperatorName& op_name);
+  void deregisterFallback_(DispatchKey dispatchKey);
+  void deregisterLibrary_(const std::string& ns);
+  void cleanup(const OperatorHandle& op, const OperatorName& op_name);
+  void checkSchemaCompatibility(
+      const OperatorHandle& op,
+      const FunctionSchema& schema,
+      const std::string& debug);
+  std::list<OperatorDef> operators_;
+#if !defined(C10_MOBILE)
+  LeftRight<ska::flat_hash_map<OperatorName, OperatorHandle>>
+      operatorLookupTable_;
+#else
+  RWSafeLeftRightWrapper<ska::flat_hash_map<OperatorName, OperatorHandle>>
+      operatorLookupTable_;
+#endif
+  // Map from namespace to debug string (saying, e.g., where the library was
+  // defined)
+  ska::flat_hash_map<std::string, std::string> libraries_;
+  std::array<impl::AnnotatedKernel, num_runtime_entries>
+      backendFallbackKernels_;
+  std::unique_ptr<detail::RegistrationListenerList> listeners_;
+  // This condition variable gets notified whenever we add a new def/impl to the
+  // dispatch table.  This is primarily used by multipy/torchdeploy, when
+  // we have multiple interpreters trying to register to the dispatch table.
+  // In this situation, whenever the non-primary interpreter would have tried
+  // to register to the dispatch table, instead it will check to see if the
+  // expected registration has already been made, and if it hasn't, wait on
+  // this condition variable to see if it was just racing with the primary
+  // interpreter.
+  //
+  // We expect it to be rare for there to be any waiters on this condition
+  // variable.  This is mostly just to help give better diagnostics if
+  // something goes horribly wrong
+  std::condition_variable cond_var_;
+  // Protect concurrent access to the dispatcher.  We store this in a
+  // `shared_ptr` as we return callbacks that call back into dispatcher methods,
+  // and we need to be able to handle and guard against the event when the
+  // `Dispatcher` has been destroyed before the callbacks fire.
+  std::shared_ptr<Guard> guard_;
+};
+/**
+ * This is a handle to an operator schema registered with the dispatcher.
+ * This handle can be used to register kernels with the dispatcher or
+ * to lookup a kernel for a certain set of arguments.
+ *
+ * A handle stores a pointer to (and the std::list iterator for) one
+ * Dispatcher::OperatorDef. Nearly every accessor below simply forwards to
+ * the OperatorEntry held in that definition (operatorDef_->op).
+ */
+class TORCH_API OperatorHandle {
+  template <typename T>
+  friend struct std::hash;
+ public:
+  // Copy and move are defaulted: they duplicate the pointer and the iterator.
+  OperatorHandle(OperatorHandle&&) noexcept = default;
+  OperatorHandle& operator=(OperatorHandle&&) noexcept = default;
+  OperatorHandle(const OperatorHandle&) = default;
+  OperatorHandle& operator=(const OperatorHandle&) = default;
+  // NOLINTNEXTLINE(performance-trivially-destructible)
+  ~OperatorHandle();
+  // Name of the operator this handle refers to.
+  const OperatorName& operator_name() const {
+    return operatorDef_->op.operator_name();
+  }
+  // True once a schema has been registered for the operator.
+  bool hasSchema() const {
+    return operatorDef_->op.hasSchema();
+  }
+  // Registered schema; OperatorEntry::schema() asserts that one exists.
+  const FunctionSchema& schema() const {
+    return operatorDef_->op.schema();
+  }
+  // Debug string recorded with the schema registration; OperatorEntry::debug()
+  // asserts that a schema exists.
+  const std::string& debug() const {
+    return operatorDef_->op.debug();
+  }
+  // Textual dump of the operator's registration state.
+  std::string dumpState() const {
+    return operatorDef_->op.dumpState();
+  }
+  // Whether a kernel is registered for exactly the key k.
+  bool hasKernelForDispatchKey(DispatchKey k) const {
+    return operatorDef_->op.hasKernelForDispatchKey(k);
+  }
+  // Fetches the kernel registered for k and reports whether it is a
+  // fallthrough kernel.
+  bool isKernelFallthroughKernel(DispatchKey k) const {
+    return operatorDef_->op.kernelForDispatchKey(k).isFallthrough();
+  }
+  // Whether a kernel is registered for any key contained in k.
+  bool hasKernelForAnyDispatchKey(DispatchKeySet k) const {
+    return operatorDef_->op.hasKernelForAnyDispatchKey(k);
+  }
+  // Whether the computed dispatch table has an entry for k.
+  bool hasComputedKernelForDispatchKey(DispatchKey k) const {
+    return operatorDef_->op.hasComputedKernelForDispatchKey(k);
+  }
+  // Kernel stored in the computed dispatch table for k.
+  SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const {
+    return operatorDef_->op.getComputedKernelForDispatchKey(k);
+  }
+  // Textual dump of the computed dispatch table.
+  std::string dumpComputedTable() const {
+    return operatorDef_->op.dumpComputedTable();
+  }
+  // Runs the OperatorEntry's internal consistency checks.
+  void checkInvariants() const {
+    operatorDef_->op.checkInvariants();
+  }
+  // Tags attached to the operator at registration time.
+  c10::ArrayRef<at::Tag> getTags() const {
+    return operatorDef_->op.getTags();
+  }
+  // Hands ownership of the error-reporting callback to the OperatorEntry.
+  void setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> callback) {
+    operatorDef_->op.setReportErrorCallback_(std::move(callback));
+  }
+  // Linear search of getTags() for tag.
+  bool hasTag(const at::Tag& tag) const {
+    for (const auto& tag_ : getTags()) {
+      if (tag == tag_) {
+        return true;
+      }
+    }
+    return false;
+  }
+  // Returns a TypedOperatorHandle<FuncType> for the same operator so that it
+  // can be called unboxed. Outside of C10_MOBILE builds the C++ signature is
+  // checked against the operator first.
+  template <class FuncType>
+  TypedOperatorHandle<FuncType> typed() const {
+    // NB: This assert is not 100% sound: you can retrieve a typed() operator
+    // handle prior to ANY C++ signature being registered on the operator
+    // and the check will say everything is OK (at which point you can then
+    // smuggle in a kernel that is typed incorrectly).  For everything
+    // in core library this won't happen, because all the static registrations
+    // will be done by the time a typed() handle is acquired.
+#if !defined C10_MOBILE
+    operatorDef_->op.assertSignatureIsCorrect<FuncType>();
+    // fn_has_symint<FuncType>::value is a compile-time constant, so the extra
+    // check for the SymInt-free signature is selected at compile time rather
+    // than through a runtime branch on a constant.
+    if constexpr (fn_has_symint<FuncType>::value) {
+      operatorDef_->op.assertSignatureIsCorrect<
+          typename fn_remove_symint<FuncType>::type>();
+    }
+#endif
+    return TypedOperatorHandle<FuncType>(operatorIterator_);
+  }
+  // Boxed call through the global dispatcher singleton.
+  void callBoxed(Stack* stack) const {
+    c10::Dispatcher::singleton().callBoxed(*this, stack);
+  }
+  // Convenience overload taking the stack by reference.
+  void callBoxed(Stack& stack) const {
+    callBoxed(&stack);
+  }
+  // Boxed call of the kernel selected for the explicit key dk.
+  void callBoxedForDispatchKey(DispatchKey dk, Stack& stack) const {
+    c10::Dispatcher::singleton().callBoxedForDispatchKey(*this, dk, &stack);
+  }
+  // Boxed redispatch using the caller-supplied key set ks.
+  void redispatchBoxed(DispatchKeySet ks, Stack* stack) const {
+    c10::Dispatcher::singleton().redispatchBoxed(*this, ks, stack);
+  }
+  // Forwards to OperatorEntry::getPythonOp with the given interpreter and
+  // slow-path accessor.
+  template <typename F>
+  PyObject* getPythonOp(
+      c10::impl::PyInterpreter* self_interpreter,
+      F slow_accessor) const {
+    return operatorDef_->op.getPythonOp(self_interpreter, slow_accessor);
+  }
+  // Handles compare equal when they point at the same OperatorDef.
+  bool operator==(const OperatorHandle& other) const {
+    return operatorDef_ == other.operatorDef_;
+  }
+  bool operator!=(const OperatorHandle& other) const {
+    return operatorDef_ != other.operatorDef_;
+  }
+ private:
+  // Only Dispatcher and TypedOperatorHandle (friends below) construct handles
+  // from a list iterator; the raw pointer is derived from it once, here.
+  explicit OperatorHandle(
+      std::list<Dispatcher::OperatorDef>::iterator operatorIterator)
+      : operatorDef_(&*operatorIterator), operatorIterator_(operatorIterator) {}
+  friend class Dispatcher;
+  template <class>
+  friend class TypedOperatorHandle;
+  // Storing a direct pointer to the OperatorDef even though we
+  // already have the iterator saves an instruction in the critical
+  // dispatch path. The iterator is effectively a
+  // pointer-to-std::list-node, and (at least in libstdc++'s
+  // implementation) the element is at an offset 16 bytes from that,
+  // because the prev/next pointers come first in the list node
+  // struct. So, an add instruction would be necessary to convert from the
+  // iterator to an OperatorDef*.
+  Dispatcher::OperatorDef* operatorDef_;
+  // We need to store this iterator in order to make
+  // Dispatcher::cleanup() fast -- it runs a lot on program
+  // termination (and presumably library unloading).
+  std::list<Dispatcher::OperatorDef>::iterator operatorIterator_;
+};
+/**
+ * This is a handle to an operator schema registered with the dispatcher.
+ * It holds the same information as an OperatorHandle, but it is templated
+ * on the operator arguments and allows calling the operator in an
+ * unboxed way.
+ *
+ * This primary template is only chosen when FuncType does not match the
+ * Return(Args...) partial specialization that follows it. Its sole content
+ * is a static_assert that reports the invalid FuncType; guts::false_t is
+ * defined elsewhere and is presumably a type-dependent "always false", so
+ * that the assertion fires on instantiation only -- confirm.
+ */
+template <class FuncType>
+class TypedOperatorHandle final {
+  static_assert(
+      guts::false_t<FuncType>(),
+      "FuncType in OperatorHandle::typed<FuncType> was not a valid function type");
+};
+template <class Return, class... Args>
+class TypedOperatorHandle<Return(Args...)> final : public OperatorHandle {
+ public:
+  // Copying and moving are both defaulted, exactly as for OperatorHandle.
+  TypedOperatorHandle(const TypedOperatorHandle&) = default;
+  TypedOperatorHandle& operator=(const TypedOperatorHandle&) = default;
+  TypedOperatorHandle(TypedOperatorHandle&&) noexcept = default;
+  TypedOperatorHandle& operator=(TypedOperatorHandle&&) noexcept = default;
+  // Unboxed call through the global dispatcher. Args is taken by value, not
+  // by &&; see [Note: Argument forwarding in the dispatcher].
+  C10_ALWAYS_INLINE Return call(Args... args) const {
+    auto&& dispatcher = c10::Dispatcher::singleton();
+    return dispatcher.call<Return, Args...>(
+        *this, std::forward<Args>(args)...);
+  }
+  // Unboxed redispatch with an explicit key set. Args is taken by value, not
+  // by &&; see [Note: Argument forwarding in the dispatcher].
+  C10_ALWAYS_INLINE Return
+  redispatch(DispatchKeySet currentDispatchKeySet, Args... args) const {
+    auto&& dispatcher = c10::Dispatcher::singleton();
+    return dispatcher.redispatch<Return, Args...>(
+        *this, currentDispatchKeySet, std::forward<Args>(args)...);
+  }
+ private:
+  // Reachable only from OperatorHandle::typed() via the friend declaration.
+  explicit TypedOperatorHandle(
+      std::list<Dispatcher::OperatorDef>::iterator operatorIterator)
+      : OperatorHandle(operatorIterator) {}
+  friend class OperatorHandle;
+};
+namespace detail {
+template <typename... Ts>
+inline void unused_arg_(const Ts&... /*ignored*/) {
+  // Deliberately empty: accepts any arguments by const reference and does
+  // nothing with them.
+}
+// CaptureKernelCall runs an unboxed kernel and temporarily holds its return
+// value so that a record function which asked for outputs can observe it.
+// Boxed kernels do not need this: their results already live in the stack
+// object, which can be passed to record functions directly. For unboxed
+// kernels the value is cached in output_, pushed onto a Stack by
+// getOutputs(), and finally handed back through release() as the actual
+// return value of the call.
+template <typename ReturnType>
+struct CaptureKernelCall {
+  template <typename F, typename... Args>
+  CaptureKernelCall(
+      const F& kernel,
+      const TypedOperatorHandle<ReturnType(Args...)>& op,
+      const DispatchKeySet& dispatchKeySet,
+      Args&&... args)
+      // Invokes the kernel right here and stores its result in output_.
+      : output_{kernel.template call<ReturnType, Args...>(
+            op,
+            dispatchKeySet,
+            std::forward<Args>(args)...)} {}
+  // Builds a fresh Stack from output_ via impl::push_outputs<...>::copy.
+  Stack getOutputs() {
+    Stack stack;
+    impl::push_outputs<ReturnType, false>::copy(output_, &stack);
+    return stack;
+  }
+  // Rvalue-qualified: callers must std::move the object to call release(),
+  // and output_ is not expected to be used afterwards. Copy elision and RVO
+  // do not apply to data members, so output_ is moved out explicitly.
+  ReturnType release() && {
+    return std::move(output_);
+  }
+ private:
+  ReturnType output_; // result of the kernel call made in the constructor
+};
+// Specialization for at::Tensor&: hand back the reference itself, never move.
+template <>
+inline at::Tensor& CaptureKernelCall<at::Tensor&>::release() && {
+  return output_; // output_ is an at::Tensor& here; std::move would not yield an lvalue reference
+}
+// Specialization for kernels that return void: the kernel is still invoked
+// from the constructor, but there is no value to store.
+template <>
+struct CaptureKernelCall<void> {
+  template <typename F, typename... Args>
+  CaptureKernelCall(
+      const F& kernel,
+      const TypedOperatorHandle<void(Args...)>& op,
+      const DispatchKeySet& dispatchKeySet,
+      Args&&... args) {
+    // Run the kernel; a void result leaves nothing to capture.
+    kernel.template call<void, Args...>(
+        op, dispatchKeySet, std::forward<Args>(args)...);
+  }
+  // No outputs were produced, so the returned Stack is empty.
+  Stack getOutputs() {
+    Stack noOutputs;
+    return noOutputs;
+  }
+  // Nothing to hand back for a void kernel.
+  void release() && {}
+};
+TORCH_API void _print_dispatch_trace( // declaration only; invoked by the dispatcher entry points below when show_dispatch_trace() is true
+    const std::string& label, // call-site tag, e.g. "[call]" or "[redispatch]"
+    const std::string& op_name, // callers pass toString(op.operator_name())
+    const DispatchKeySet& dispatchKeySet); // key set in effect at the call site
+} // namespace detail
+// Profiled slow path of Dispatcher::call. Args is by value, not &&; see [Note: Argument forwarding in the dispatcher]
+template <class Return, class... Args>
+inline Return Dispatcher::callWithDispatchKeySlowPath(
+    const TypedOperatorHandle<Return(Args...)>& op,
+    at::StepCallbacks& stepCallbacks,
+    DispatchKeySet dispatchKeySet,
+    const KernelFunction& kernel,
+    Args... args) {
+  // The arguments are boxed only when the guard reports that it needs inputs;
+  // for perf reasons we do not want to box them prematurely.
+  at::RecordFunction guard(std::move(stepCallbacks)); // stepCallbacks is moved-from after this line
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(op.operatorDef_->op.isObserved());
+  auto dispatchKey = dispatchKeySet.highestPriorityTypeId();
+  auto& schema = op.schema();
+  auto schema_ref = std::reference_wrapper<const FunctionSchema>(schema);
+  constexpr auto num_boxed_args = impl::boxed_size<Args...>();
+  if constexpr (num_boxed_args != 0) { // with zero boxed args there is nothing to box (and no zero-length array)
+    if (guard.needsInputs()) {
+      // A std::array<IValue, num_boxed_args> would default-construct every
+      // IValue before boxing overwrote it; a raw byte buffer aligned for
+      // IValue has no such requirement.
+      // NOLINTNEXTLINE(*array*)
+      alignas(IValue) std::byte boxedArgs[num_boxed_args * sizeof(IValue)];
+      // Cursor handed to boxArgsToStack. The debug-only assert below expects
+      // it to end exactly one past the last slot, i.e. presumably
+      // boxArgsToStack advances it once per boxed IValue (defined elsewhere).
+      IValue* boxedArgsPtr = reinterpret_cast<IValue*>(boxedArgs);
+      impl::boxArgsToStack(boxedArgsPtr, args...);
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          reinterpret_cast<std::byte*>(boxedArgsPtr) ==
+          boxedArgs + num_boxed_args * sizeof(IValue));
+      // I don't *think* we need std::launder here, because IValue has
+      // no subclasses and no const or reference fields.
+      runRecordFunction(
+          guard,
+          schema_ref,
+          dispatchKey,
+          dispatchKeySet,
+          c10::ArrayRef<const c10::IValue>(
+              reinterpret_cast<IValue*>(boxedArgs), num_boxed_args));
+      boxedArgsPtr = reinterpret_cast<IValue*>(boxedArgs); // rewind the cursor to the first slot
+      for (size_t ii = 0; ii < num_boxed_args; ++ii) {
+        (boxedArgsPtr + ii)->~IValue(); // raw storage: each boxed IValue must be destroyed by hand
+      }
+    } else {
+      runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); // inputs not requested: overload without args
+    }
+  } else {
+    runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); // operator has no boxed arguments
+  }
+  if (C10_UNLIKELY(guard.needsOutputs())) {
+    // Run the kernel through CaptureKernelCall so that its result can be
+    // shown to the RecordFunction before being returned.
+    detail::CaptureKernelCall<Return> captureKernelCall(
+        kernel, op, dispatchKeySet, std::forward<Args>(args)...);
+    guard.setOutputs(captureKernelCall.getOutputs());
+    // Hand the captured result back to the caller as the real return value.
+    return std::move(captureKernelCall).release();
+  }
+  // No outputs requested: call directly; guard stays alive until the return.
+  return kernel.template call<Return, Args...>(
+      op, dispatchKeySet, std::forward<Args>(args)...);
+}
+// Unboxed entry point of the dispatcher. Args is deliberately taken by value
+// rather than by &&; see [Note: Argument forwarding in the dispatcher].
+template <class Return, class... Args>
+C10_ALWAYS_INLINE_UNLESS_MOBILE Return Dispatcher::call(
+    const TypedOperatorHandle<Return(Args...)>& op,
+    Args... args) const {
+  // Derive the dispatch key set from the unboxed arguments.
+  auto& opEntry = op.operatorDef_->op;
+  auto keySet = opEntry.dispatchKeyExtractor()
+                    .template getDispatchKeySetUnboxed<Args...>(args...);
+#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG)
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+    detail::_print_dispatch_trace(
+        "[call]", toString(op.operator_name()), keySet);
+  }
+#endif
+  const KernelFunction& kernel = opEntry.lookup(keySet);
+#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
+  // Take the instrumented slow path only when step callbacks exist and this
+  // operator is observed.
+  auto callbacks = at::getStepCallbacksUnlessEmpty(at::RecordScope::FUNCTION);
+  if (C10_UNLIKELY(callbacks.has_value() && opEntry.isObserved())) {
+    return callWithDispatchKeySlowPath<Return, Args...>(
+        op, *callbacks, keySet, kernel, std::forward<Args>(args)...);
+  }
+#endif // PYTORCH_DISABLE_PER_OP_PROFILING
+#ifdef FBCODE_CAFFE2
+  if (profilingOperatorEvents()) {
+    std::vector<void*> argsAddresses = {(void*)(&args)...};
+    std::vector<const char*> argsTypes = {(typeid(args).name())...};
+    // Fires the start probe on construction and the end probe on
+    // destruction, so the pair brackets the kernel call below.
+    struct FireOpRAII {
+      FireOpRAII(
+          at::RecordFunction::schema_ref_t schema_ref,
+          std::vector<void*>& argsAddresses,
+          std::vector<const char*>& argsTypes)
+          : schema_ref_(schema_ref) {
+        fireOpStartUSDT(schema_ref, argsAddresses, argsTypes);
+      }
+      ~FireOpRAII() {
+        fireOpEndUSDT(schema_ref_);
+      }
+      at::RecordFunction::schema_ref_t schema_ref_;
+    } event(op.schema(), argsAddresses, argsTypes);
+    return kernel.template call<Return, Args...>(
+        op, keySet, std::forward<Args>(args)...);
+  }
+#endif // FBCODE_CAFFE2
+  // Plain path: invoke the selected kernel directly.
+  return kernel.template call<Return, Args...>(
+      op, keySet, std::forward<Args>(args)...);
+}
+// Unboxed redispatch using a caller-supplied key set. Args is taken by value
+// rather than by &&; see [Note: Argument forwarding in the dispatcher].
+template <class Return, class... Args>
+inline Return Dispatcher::redispatch(
+    const TypedOperatorHandle<Return(Args...)>& op,
+    DispatchKeySet currentDispatchKeySet,
+    Args... args) const {
+  // RecordFunction is intentionally not used on redispatch.
+#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG)
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+    detail::_print_dispatch_trace(
+        "[redispatch]", toString(op.operator_name()), currentDispatchKeySet);
+  }
+#endif
+  // Select the kernel for the supplied key set and forward the arguments.
+  auto& opEntry = op.operatorDef_->op;
+  const KernelFunction& target = opEntry.lookup(currentDispatchKeySet);
+  return target.template call<Return, Args...>(
+      op, currentDispatchKeySet, std::forward<Args>(args)...);
+}
+// Boxed entry point: arguments are read from, and results written to, *stack.
+inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack)
+    const {
+  // note: this doesn't need the mutex because write operations on the list keep
+  // iterators intact.
+  const auto& opEntry = op.operatorDef_->op;
+  auto keySet = opEntry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack);
+#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG)
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+    detail::_print_dispatch_trace(
+        "[callBoxed]", toString(op.operator_name()), keySet);
+  }
+#endif
+  const auto& target = opEntry.lookup(keySet);
+#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
+  auto callbacks = at::getStepCallbacksUnlessEmpty(at::RecordScope::FUNCTION);
+  if (C10_UNLIKELY(callbacks.has_value() && opEntry.isObserved())) {
+    at::RecordFunction guard(std::move(*callbacks));
+    auto topKey = keySet.highestPriorityTypeId();
+    std::reference_wrapper<const FunctionSchema> schemaRef(op.schema());
+    // Hand the current stack contents to the guard only if it wants inputs.
+    if (guard.needsInputs()) {
+      runRecordFunction(
+          guard,
+          schemaRef,
+          topKey,
+          keySet,
+          c10::ArrayRef<const c10::IValue>(stack->data(), stack->size()));
+    } else {
+      runRecordFunction(guard, schemaRef, topKey, keySet);
+    }
+    // The guard stays alive for the duration of the kernel call.
+    target.callBoxed(op, keySet, stack);
+    if (C10_UNLIKELY(guard.needsOutputs())) {
+      guard.setOutputs(*stack);
+    }
+    return;
+  }
+#endif // PYTORCH_DISABLE_PER_OP_PROFILING
+  // Uninstrumented path.
+  target.callBoxed(op, keySet, stack);
+}
+// Calls, in boxed form, the kernel registered for the explicit key dk; if the
+// operator has no kernel for dk, the dispatcher-wide backend fallback kernel
+// for dk is used instead.
+//   op:    operator to call
+//   dk:    dispatch key whose kernel is invoked
+//   stack: boxed arguments; passed through to the kernel
+// NB: this doesn't count as a "true" dispatcher jump, so no instrumentation
+inline void Dispatcher::callBoxedForDispatchKey(
+    const OperatorHandle& op,
+    DispatchKey dk,
+    Stack* stack) const {
+  // note: this doesn't need the mutex because write operations on the list keep
+  // iterators intact.
+  const auto& entry = op.operatorDef_->op;
+  // We still compute this as we're obligated to pass it on to the internal
+  // kernel, if it is a boxed fallback
+  auto dispatchKeySet =
+      entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack);
+  // The explicit reference return type matters: a deduced lambda return type
+  // decays to KernelFunction by value, which copied the kernel on every call.
+  // Both sources are lvalues that live beyond this function, and callBoxed /
+  // redispatchBoxed likewise hold the kernel by reference.
+  const KernelFunction& kernel = [&]() -> const KernelFunction& {
+    if (op.hasKernelForDispatchKey(dk)) {
+      return entry.kernelForDispatchKey(dk);
+    }
+    const auto idx = getDispatchTableIndexForDispatchKey(dk);
+    TORCH_INTERNAL_ASSERT(idx >= 0);
+    return backendFallbackKernels_[idx].kernel;
+  }();
+  kernel.callBoxed(op, dispatchKeySet, stack);
+}
+// Boxed redispatch: the kernel is chosen from the caller-supplied key set
+// instead of being extracted from the stack.
+inline void Dispatcher::redispatchBoxed(
+    const OperatorHandle& op,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) const {
+  // note: this doesn't need the mutex because write operations on the list keep
+  // iterators intact.
+#if defined(HAS_TORCH_SHOW_DISPATCH_TRACE) || !defined(NDEBUG)
+  DispatchTraceNestingGuard debug_guard;
+  if (show_dispatch_trace()) {
+    detail::_print_dispatch_trace(
+        "[redispatchBoxed]", toString(op.operator_name()), dispatchKeySet);
+  }
+#endif
+  const auto& opEntry = op.operatorDef_->op;
+  const auto& target = opEntry.lookup(dispatchKeySet);
+  target.callBoxed(op, dispatchKeySet, stack);
+}
+} // namespace c10
+namespace std {
+// Hashes the address of the underlying OperatorDef, the same pointer that
+// OperatorHandle::operator== compares.
+template <>
+struct hash<c10::OperatorHandle> {
+  size_t operator()(const c10::OperatorHandle& op) const noexcept {
+    void* const defAddress = static_cast<void*>(op.operatorDef_);
+    return std::hash<void*>{}(defAddress);
+  }
+};
+} // namespace std
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/ObservedOperators.h ADDED Viewed

	@@ -0,0 +1,22 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/operator_name.h>
+#include <string>
+#include <unordered_set>
+namespace c10 {
+struct TORCH_API ObservedOperators { // static-only helper: the deleted default constructor prevents instantiation
+  ObservedOperators() = delete;
+  static bool isObserved(const OperatorName& name); // defined elsewhere; presumably false for names in the unobserved list -- confirm
+  static std::unordered_set<std::string>& getUnobservedOperatorList(); // mutable reference to a set of strings; where the set is stored is not shown here
+};
+} // namespace c10
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorEntry.h ADDED Viewed

	@@ -0,0 +1,342 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/boxing/KernelFunction.h>
+#include <ATen/core/dispatch/DispatchKeyExtractor.h>
+#include <ATen/core/function_schema.h>
+#include <ATen/core/ivalue.h>
+#include <c10/core/DispatchKey.h>
+#include <c10/core/PyHandleCache.h>
+#include <c10/core/SafePyObject.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/flat_hash_map.h>
+#include <ATen/core/dispatch/CppSignature.h>
+#include <ATen/core/dispatch/OperatorOptions.h>
+#include <ATen/core/dispatch/RegistrationHandleRAII.h>
+#include <ATen/core/enum_tag.h>
+#include <array>
+#include <list>
+#include <optional>
+#ifdef C10_MOBILE
+#define C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
+#endif
+namespace c10 {
+class Dispatcher;
+namespace impl {
+// A kernel exactly as a user registered it. On top of the KernelFunction it
+// carries metadata that dispatching itself never needs (which is why the
+// actual DispatchTable does not store AnnotatedKernel) but that makes good
+// error messages possible.
+struct AnnotatedKernel final {
+  AnnotatedKernel() = default;
+  // Takes ownership of all three pieces by moving them into the members.
+  AnnotatedKernel(
+      KernelFunction kernelFn,
+      std::unique_ptr<FunctionSchema> inferredSchema,
+      std::string debugInfo)
+      : kernel(std::move(kernelFn)),
+        inferred_function_schema(std::move(inferredSchema)),
+        debug(std::move(debugInfo)) {}
+  // The callable that is dispatched to.
+  KernelFunction kernel;
+  // Schema inferred for the kernel; held by unique_ptr and may be empty.
+  std::unique_ptr<FunctionSchema> inferred_function_schema;
+  // Short debug string identifying this kernel. Most importantly it records
+  // the TORCH_LIBRARY block that performed the registration.
+  std::string debug;
+};
+// An operator schema paired with metadata describing where that schema was
+// registered.
+struct AnnotatedSchema final {
+  // Both arguments are moved into the corresponding members.
+  AnnotatedSchema(FunctionSchema registeredSchema, std::string debugInfo)
+      : schema(std::move(registeredSchema)), debug(std::move(debugInfo)) {}
+  // The registered schema itself.
+  FunctionSchema schema;
+  // Where the registration of this schema occurred.
+  std::string debug;
+};
+// Internal data structure that records information about a specific operator.
+// It's not part of the public API; typically, users will interact with
+// OperatorHandle instead.
+//
+// Concurrent writes to OperatorEntry are protected by the GLOBAL Dispatcher
+// lock (this is important because some methods in OperatorEntry access
+// dispatcher state)
+class TORCH_API OperatorEntry final {
+ public:
+  explicit OperatorEntry(OperatorName&& operator_name);
+  OperatorEntry(const OperatorEntry&) = delete;
+  OperatorEntry(OperatorEntry&&) noexcept = delete;
+  OperatorEntry& operator=(const OperatorEntry&) = delete;
+  OperatorEntry& operator=(OperatorEntry&&) noexcept = delete;
+  const FunctionSchema& schema() const {
+    TORCH_INTERNAL_ASSERT(
+        schema_.has_value(),
+        "Tried to access the schema for ",
+        name_,
+        " which doesn't have a schema registered yet");
+    return schema_->schema;
+  }
+  const std::string& debug() const {
+    TORCH_INTERNAL_ASSERT(schema_.has_value());
+    return schema_->debug;
+  }
+  bool hasSchema() const {
+    return schema_.has_value();
+  }
+  bool isObserved() const {
+    return is_observed_;
+  }
+  // We may allocate an OperatorEntry for an operator even when we don't
+  // have a schema.  When we receive the schema registration, we post
+  // facto register a schema.
+  //
+  // NB: registerSchema/deregisterSchema are not idempotent; if you
+  // attempt to register a schema when one is already present or vice
+  // versa that is an error.  (Refcounting for the registrations is
+  // handled in the OperatorHandle in Dispatcher)
+  void registerSchema(
+      FunctionSchema&& /*schema*/,
+      std::string&& debug,
+      std::vector<at::Tag> tags = {});
+  void deregisterSchema();
+  const OperatorName& operator_name() const {
+    return name_;
+  }
+#ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
+  using AnnotatedKernelContainer = std::array<AnnotatedKernel, 1>;
+#else
+  using AnnotatedKernelContainer = std::list<AnnotatedKernel>;
+#endif
+  using AnnotatedKernelContainerIterator = AnnotatedKernelContainer::iterator;
+  // Why are kernels and fallback asymmetric?  It has to do with ownership.
+  // Kernels and the computed dispatch tables for them are canonically
+  // owned by OperatorEntry, but backend fallbacks are specified once
+  // and apply for all operators, so they should be owned by Dispatcher.
+  // However, the registration of a backend fallback affects the
+  // state of the computed dispatch table, so when a backend fallback
+  // is updated, we need to update the operator tables too.  Thus,
+  // registerKernel is the mechanism by which we give kernels to
+  // operator entry to own (and update dispatch table), but we only
+  // need a non-owning mechanism to update fallback.
+  // Precondition: Dispatcher::mutex_ is held
+  // Postcondition: caller is responsible for disposing of the kernel
+  AnnotatedKernelContainerIterator registerKernel(
+      const Dispatcher& dispatcher,
+      std::optional<DispatchKey> dispatch_key,
+      KernelFunction kernel,
+      std::optional<CppSignature> cpp_signature,
+      std::unique_ptr<FunctionSchema> inferred_function_schema,
+      std::string debug);
+  // Precondition: Dispatcher::mutex_ is held
+  void deregisterKernel_(
+      const Dispatcher& dispatcher,
+      std::optional<DispatchKey> dispatch_key,
+      AnnotatedKernelContainerIterator kernel);
+  // Precondition: Dispatcher::mutex_ is held
+  void updateFallback(const Dispatcher& dispatcher, DispatchKey dispatch_key);
+  // Precondition: Dispatcher::mutex_ is held
+  void updateSchemaAliasAnalysis(AliasAnalysisKind a) {
+    TORCH_INTERNAL_ASSERT(schema_.has_value());
+    schema_->schema.setAliasAnalysis(a);
+  }
+  std::string dumpComputedTable() const;
+  std::string dumpState() const;
+  void checkInvariants() const;
+  const DispatchKeyExtractor& dispatchKeyExtractor() const {
+    return dispatchKeyExtractor_;
+  }
+  // Asserts that the given FuncType is correct for calling this operator in an
+  // unboxed way.
+  template <class FuncType>
+  inline void assertSignatureIsCorrect() {
+    assertSignatureIsCorrect(
+        CppSignature::make<FuncType>(), fn_has_symint<FuncType>::value);
+  }
+  void assertSignatureIsCorrect(
+      const CppSignature& call_signature,
+      bool has_symint) const;
+  [[noreturn]] void reportError(DispatchKey dispatchKey) const;
+  const KernelFunction& lookup(DispatchKeySet ks) const {
+    const auto idx = ks.getDispatchTableIndexForDispatchKeySet();
+    if (C10_UNLIKELY(idx == -1)) {
+      reportError(ks.highestPriorityTypeId());
+    }
+    const auto& kernel = dispatchTable_[idx];
+    // A valid kernel *always* has a boxed kernel and *may* have an
+    // unboxed kernel. However, we typically do unboxed calls in at::
+    // APIs, where the kernel 1) will very likely be valid and 2)
+    // should have an unboxed kernel. Checking the unboxed kernel
+    // first will allow us to avoid touching the boxed kernel at all
+    // in the common case.
+    if (C10_UNLIKELY(!kernel.isValidUnboxed())) {
+      if (!kernel.isValid()) {
+        reportError(ks.highestPriorityTypeId());
+      }
+    }
+    return kernel;
+  }
+  std::string listAllDispatchKeys() const;
+  // Returns true if kernel_ has entry for any key in ks.
+  //
+  // Invariant: There are no alias keys in the passed-in dispatch key set.
+  // Note [No Alias Keys in DispatchKeySet]
+  // Alias keys should be checked using `hasKernelForDispatchKey`
+  // Alias keys shouldn't go inside of a DispatchKeySet, since they can
+  // technically have a value > 63 (causing overflow).
+  bool hasKernelForAnyDispatchKey(DispatchKeySet ks) const;
+  // Returns true if kernel_ has entry for a particular key.
+  bool hasKernelForDispatchKey(DispatchKey k) const;
+  // Retrieves the kernel entry at a particular key.  Symmetric with
+  // hasKernelForDispatchKey.  To get the AnnotatedKernel, see
+  // getKernelForDispatchKey (private)
+  const KernelFunction& kernelForDispatchKey(DispatchKey k) const;
+  // Returns true if the "computed table" has an entry for a particular key.
+  bool hasComputedKernelForDispatchKey(DispatchKey k) const;
+  // Returns a KernelFunction corresponding to the kernel in dispatchTable
+  SafeKernelFunction getComputedKernelForDispatchKey(DispatchKey k) const;
+  // Returns all the operator tags added at the time of registration
+  const std::vector<at::Tag>& getTags() const;
+  void setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> callback);
+  template <typename F>
+  PyObject* getPythonOp(PyInterpreter* self_interpreter, F slow_accessor)
+      const {
+    return py_cache_.ptr_or(self_interpreter, slow_accessor);
+  }
+ private:
+  OperatorName name_;
+  std::optional<AnnotatedSchema> schema_;
+#ifndef C10_MOBILE
+  std::vector<at::Tag> tags_;
+#endif
+  std::array<KernelFunction, c10::num_runtime_entries> dispatchTable_;
+  DispatchKeyExtractor dispatchKeyExtractor_;
+  // Pointer to the torch.ops.ns.op.overload object for speed
+  c10::PyHandleCache py_cache_;
+  // kernels_ stores all registered kernels for the corresponding dispatch key
+  // and catchAllKernels_ stores the catch-all kernels.
+  // If an operator library gets loaded that overwrites an already existing
+  // kernel, both kernels will be in that list but only the newer one will be in
+  // dispatchTable. If any of the kernels go away (say the library gets
+  // unloaded), we remove the kernel from this list and update the
+  // dispatchTable if necessary.
+  // Kernels in the list are ordered by registration time descendingly,
+  // newer registrations are before older registrations.
+  // We do not combine dispatchTable and kernels into one hash map because
+  // kernels is a larger data structure and accessed quite infrequently
+  // while dispatchTable is accessed often and should be kept small to fit
+  // into CPU caches.
+  // Invariants:
+  //  - dispatchTable[dispatch_key] == kernels_[dispatch_key].front()
+  //  - dispatchTable[dispatch_key] does not exist if and only if
+  //    kernels_[dispatch_key] does not exist
+  //  - If kernels_[dispatch_key] exists, then it has elements.
+  //    It is never an empty list.
+  //
+  // Why do we do that?
+  // -----
+  // We mostly do this to enable Jupyter notebooks where a cell registering
+  // a kernel could be executed multiple times and the later execution
+  // should overwrite the earlier one. Note that this still fails when the
+  // function schema changed between the executions, but it works as long
+  // as the function schema didn't change. A better solution would be to
+  // unload the old extension library from the Jupyter cell when the cell is
+  // re-executed and then only allow one kernel here, i.e. error if a kernel
+  // is already registered, but that's a lot of effort to implement and
+  // currently not high-pri.
+  ska::flat_hash_map<
+      DispatchKey,
+#ifdef C10_DISPATCHER_ONE_KERNEL_PER_DISPATCH_KEY
+      // On mobile, we needn't worry about Jupyter notebooks.
+      std::array<AnnotatedKernel, 1>
+#else
+      std::list<AnnotatedKernel>
+#endif
+      >
+      kernels_;
+  const AnnotatedKernel& missingKernel() const;
+  const AnnotatedKernel& ambiguousAutogradOtherKernel() const;
+  // cpp_signature_ stores function signature if any of
+  // the kernels was created in a way that allowed us to know the function
+  // signature (i.e. by supplying an unboxed C++ kernel function).
+  // If this is set, it will be used to check that future kernel
+  // registrations match and it will be used in unboxed function calls
+  // to verify their arguments against the known function signature.
+  // Bundles a known C++ signature with bookkeeping about where it came from.
+  struct CppSignatureWithDebug {
+    // The recorded C++ function signature.
+    CppSignature signature;
+    // Debug text stored alongside the signature (presumably describes the
+    // registration that supplied it -- confirm at the use sites).
+    std::string debug;
+    // Optional dispatch key stored with the signature (presumably the key of
+    // the kernel registration that supplied it -- confirm at the use sites).
+    std::optional<DispatchKey> dispatch_key;
+  };
+  std::optional<CppSignatureWithDebug> cpp_signature_;
+  std::optional<CppSignatureWithDebug> sym_cpp_signature_;
+  // A Python custom error handler for OperatorEntry::reportError
+  std::unique_ptr<c10::SafePyObject> report_error_callback_;
+  // Whether this operator needs to be observed with RecordFunction
+  const bool is_observed_;
+  [[noreturn]] void reportSignatureError(
+      const CppSignature& call_signature,
+      const CppSignatureWithDebug& saved_signature) const;
+  const KernelFunction& computeDispatchTableEntry(
+      const c10::Dispatcher& dispatcher,
+      DispatchKey dispatch_key) const;
+  std::pair<const AnnotatedKernel&, const char*>
+  computeDispatchTableEntryWithDebug(
+      const c10::Dispatcher& dispatcher,
+      DispatchKey dispatch_key) const;
+  // This function re-establishes the invariant that dispatchTable
+  // contains the front element from the kernels list for a given runtime
+  // dispatch key.
+  void updateDispatchTableEntry_(
+      const c10::Dispatcher& dispatcher,
+      DispatchKey dispatch_key);
+  // Like above, but also handles alias dispatch keys.
+  void updateDispatchTable_(
+      const c10::Dispatcher& dispatcher,
+      DispatchKey dispatch_key);
+  // Like above, but for ALL entries in the dispatch table.
+  void updateDispatchTableFull_(const c10::Dispatcher& dispatcher);
+  // Retrieves a pointer to AnnotatedKernel at
+  // kernels_.at(dispatch_key).front().
+  const AnnotatedKernel* getKernelForDispatchKey(
+      DispatchKey dispatch_key) const;
+};
+} // namespace impl
+} // namespace c10
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/OperatorOptions.h ADDED Viewed

	@@ -0,0 +1,35 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <cstdint>
+namespace c10 {
+// The kinds of alias analysis that can be selected.
+enum class AliasAnalysisKind : uint8_t {
+  INTERNAL_SPECIAL_CASE,
+  // The most conservative alias analysis type, assumes side-effects.
+  // This is the default analysis.
+  CONSERVATIVE,
+  FROM_SCHEMA,
+  PURE_FUNCTION
+};
+// Returns the enumerator's spelling as a string literal. Any value that is
+// not one of the four declared enumerators yields "UNKNOWN".
+#if !defined(_MSC_VER)
+constexpr // Our current MSVC version has a bug that doesn't allow this to be
+          // constexpr.
+#endif
+    inline const char*
+    toString(AliasAnalysisKind aliasAnalysisKind) {
+  switch (aliasAnalysisKind) {
+    case AliasAnalysisKind::CONSERVATIVE:
+      return "CONSERVATIVE";
+    case AliasAnalysisKind::FROM_SCHEMA:
+      return "FROM_SCHEMA";
+    case AliasAnalysisKind::PURE_FUNCTION:
+      return "PURE_FUNCTION";
+    case AliasAnalysisKind::INTERNAL_SPECIAL_CASE:
+      return "INTERNAL_SPECIAL_CASE";
+  }
+  // Reached only for values outside the declared enumerators.
+  return "UNKNOWN";
+}
+} // namespace c10
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/dispatch/RegistrationHandleRAII.h ADDED Viewed

	@@ -0,0 +1,41 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <functional>
+namespace c10 {
+// Move-only handle that owns a callback and runs it when the owning handle
+// is destroyed. A callback is run at most once: moving a handle transfers
+// the callback and leaves the source empty.
+class RegistrationHandleRAII final {
+ public:
+  // Takes ownership of `onDestruction`. An empty std::function is allowed and
+  // makes destruction a no-op.
+  explicit RegistrationHandleRAII(std::function<void()> onDestruction)
+      : onDestruction_(std::move(onDestruction)) {}
+  // Runs the stored callback, if there is one.
+  ~RegistrationHandleRAII() {
+    if (onDestruction_) {
+      onDestruction_();
+    }
+  }
+  RegistrationHandleRAII(const RegistrationHandleRAII&) = delete;
+  RegistrationHandleRAII& operator=(const RegistrationHandleRAII&) = delete;
+  // Steals the callback from `rhs`; `rhs` is explicitly emptied so that its
+  // destructor does nothing.
+  RegistrationHandleRAII(RegistrationHandleRAII&& rhs) noexcept
+      : onDestruction_(std::move(rhs.onDestruction_)) {
+    rhs.onDestruction_ = nullptr;
+  }
+  // Replaces this handle's callback with the one from `rhs` and empties
+  // `rhs`.
+  // NOTE(review): a callback already held by *this is discarded without being
+  // run (unchanged from the previous behavior) -- confirm this is intended.
+  RegistrationHandleRAII& operator=(RegistrationHandleRAII&& rhs) noexcept {
+    // Without this guard a self-move would clear the callback right after
+    // "moving" it onto itself, so the destruction hook would never run.
+    if (this != &rhs) {
+      onDestruction_ = std::move(rhs.onDestruction_);
+      rhs.onDestruction_ = nullptr;
+    }
+    return *this;
+  }
+ private:
+  std::function<void()> onDestruction_;
+};
+} // namespace c10
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/adaption.h ADDED Viewed

	@@ -0,0 +1,86 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/Tensor.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/core/List.h>
+#include <c10/core/TensorOptions.h>
+/*
+ * [Note: hacky wrapper removal for optional tensor]
+ *
+ * The kernel implementation takes an optional tensor marked in the schema as
+ * Tensor? but the C++ function takes Tensor instead of the std::optional<Tensor>
+ * expected by the dispatcher.
+ *
+ * To remove the hacky wrapper, the C++ function is changed to take
+ * std::optional<Tensor> and unwrap the Tensor value at the beginning of
+ * the function, e.g.:
+ *   > c10::MaybeOwned<Tensor> weight_maybe_owned =
+ *   >     at::borrow_from_optional_tensor(weight_opt);
+ *   > const Tensor& weight = *weight_maybe_owned;
+ *
+ * We may want to make the kernel handle optional directly without
+ * going through the creation of a default-constructed Tensor in
+ * at::borrow_from_optional_tensor.
+ */
+/*
+ * [Note: hacky wrapper removal for TensorOptions]
+ *
+ * The kernel implementation takes a TensorOptions argument but the dispatcher
+ * expects separate arguments for dtype, layout, device, pin_memory.
+ *
+ * To remove the hacky wrapper, the kernel implementation is changed to take
+ * the 4 arguments (dtype, layout, device, pin_memory), and assemble the
+ * TensorOptions value at the beginning of the function, e.g.:
+ *   > TensorOptions options = TensorOptions().dtype(dtype).layout(layout)
+ *   >    .device(device).pinned_memory(pin_memory);
+ *
+ * We may want to make the kernel handle these parameters directly without going
+ * through the creation of a TensorOptions value.
+ */
+namespace c10::impl {
+TORCH_API void common_device_check_failure(Device common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName);
+// Folds the device of `tensor` into `common_device`:
+//  - undefined tensors are ignored;
+//  - if no common device has been recorded yet, the tensor's device is stored;
+//  - otherwise a mismatch is reported through common_device_check_failure.
+inline void check_and_update_common_device(std::optional<Device>& common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  // TODO: Remove this once the following issue is addressed:
+  // https://github.com/pytorch/pytorch/issues/57380
+  if (tensor.defined()) {
+    if (common_device.has_value()) {
+      if (C10_UNLIKELY(common_device != tensor.device())) {
+        common_device_check_failure(*common_device, tensor, methodName, argName);
+      }
+    } else {
+      common_device = tensor.device();
+    }
+  }
+}
+// Optional tensor: an empty optional is skipped, otherwise the contained
+// tensor goes through the single-tensor check.
+inline void check_and_update_common_device(std::optional<Device>& common_device, const std::optional<at::Tensor>& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  if (!tensor.has_value()) {
+    return;
+  }
+  check_and_update_common_device(common_device, *tensor, methodName, argName);
+}
+// Tensor list: every element is checked in iteration order.
+inline void check_and_update_common_device(std::optional<Device>& common_device, at::ITensorListRef tensors, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  for (const auto& element : tensors) {
+    check_and_update_common_device(common_device, element, methodName, argName);
+  }
+}
+// List of optional tensors: every element is checked in iteration order.
+inline void check_and_update_common_device(std::optional<Device>& common_device, const List<std::optional<at::Tensor>>& tensors, at::CheckedFrom methodName, at::CheckedFrom argName) {
+  for (const auto& element : tensors) {
+    check_and_update_common_device(common_device, element, methodName, argName);
+  }
+}
+} // namespace c10::impl
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/infer_schema.h ADDED Viewed

	@@ -0,0 +1,162 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+/**
+ * This file contains functionality to take a C++ function and infer its
+ * c10::FunctionSchema.
+ */
+#include <ATen/core/function_schema.h>
+#include <c10/util/Metaprogramming.h>
+namespace c10 {
+namespace detail::infer_schema {
+/// The templated inference code creates `ArgumentDef` instead of `Argument`,
+/// because that can be constructed at compile time and has a much smaller
+/// binary size than having calls to `Argument` constructors in the template.
+/// Creating `Argument` objects from `ArgumentDef` can then be done at
+/// runtime in a non-templated way.
+struct ArgumentDef final {
+  /// Type of a nullary function returning a `TypePtr`.
+  using GetTypeFn = TypePtr();
+  /// Getter for the argument's type; null in a default-constructed object.
+  GetTypeFn* getTypeFn;
+  /// Getter for the argument's "fake" type; null in a default-constructed
+  /// object.
+  GetTypeFn* getFakeTypeFn;
+  /// Creates a definition with both getters set to null.
+  constexpr ArgumentDef(): getTypeFn(nullptr), getFakeTypeFn(nullptr) {}
+  /// Creates a definition from the two getter function pointers.
+  explicit constexpr ArgumentDef(GetTypeFn *getTypeFn, GetTypeFn *getFakeTypeFn): getTypeFn(getTypeFn), getFakeTypeFn(getFakeTypeFn) {}
+};
+/// Compile-time boolean wrapper: `bool_t<true>` derives from `std::true_type`
+/// and `bool_t<false>` from `std::false_type`.
+template<bool V>
+struct bool_t {};
+template<> struct bool_t<true> : std::true_type {};
+template<> struct bool_t<false> : std::false_type {};
+/// Validates the static C++ types `Types` at compile time to catch common
+/// mistakes; always returns 0 so it can be used inside an expression.
+template <class... Types>
+constexpr int checkStaticTypes() {
+ // Give nice error messages for some of the common error cases.
+ // Use a LOUD ERROR MESSAGE SO USERS SEE THE STATIC_ASSERT
+ // Every integral type must be one of int8_t, int64_t or bool.
+ static_assert(
+     (... && (!std::is_integral_v<Types> || std::is_same_v<Types, int8_t> || std::is_same_v<Types, int64_t> || std::is_same_v<Types, bool>)),
+     "INVALID TYPE: Only int8_t, int64_t and bool are supported as an integral argument type");
+ // float is rejected outright.
+ static_assert(
+     (... && !std::is_same_v<Types, float>),
+     "INVALID TYPE: float is not supported as an argument type, use double instead");
+ return 0;
+}
+/// Builds, at compile time, a `std::array` holding one `ArgumentDef` per type
+/// in `Ts`. Each entry stores the addresses of `getTypePtrCopy` and
+/// `getFakeTypePtrCopy` instantiated for the decayed type. The
+/// `std::index_sequence` parameter is not used by the body.
+template <typename... Ts, size_t... Is>
+constexpr std::array<ArgumentDef, sizeof...(Ts)> createArgumentVectorFromTypes(std::index_sequence<Is...> /*unused*/) {
+  // The comma expression runs the static type checks first and then yields
+  // the array as the value of the whole expression.
+  return (
+    // Check types for common errors
+    checkStaticTypes<Ts...>(),
+    // Create the return value
+    std::array<ArgumentDef, sizeof...(Ts)>{
+      ArgumentDef(&getTypePtrCopy<std::decay_t<Ts>>, &getFakeTypePtrCopy<std::decay_t<Ts>>)...}
+  );
+}
+/// Creates a `std::array` of `ArgumentDef` from a list of C++ types that are
+/// specified as a `guts::typelist::typelist`. The primary template is empty;
+/// only the typelist specialization provides `call()`.
+template<class ParameterTypes> struct createArguments final {};
+template<class... ParameterTypes>
+struct createArguments<guts::typelist::typelist<ParameterTypes...>> final {
+  /// Returns one `ArgumentDef` per entry of the typelist, in order.
+  static constexpr std::array<ArgumentDef, sizeof...(ParameterTypes)> call() {
+    return createArgumentVectorFromTypes<ParameterTypes...>(
+        std::make_index_sequence<sizeof...(ParameterTypes)>()
+    );
+  }
+};
+/// Creates a `std::array` of `ArgumentDef` from a list of C++ types that are
+/// specified as a tuple (i.e. in the way c10 kernels return values).
+/// It can be a tuple<A, B, C> if there's three output arguments with types A, B, C.
+/// It can be an empty tuple<>, or void for kernels that don't return anything.
+/// It can be a single type A (i.e. no tuple) for the case where a kernel just
+/// returns one value.
+template<class ReturnTypeTuple, class Enable = void> struct createReturns final {};
+/// Tuple case: one `ArgumentDef` per tuple element.
+template<class... ReturnTypes>
+struct createReturns<std::tuple<ReturnTypes...>, void> final {
+  static constexpr std::array<ArgumentDef, sizeof...(ReturnTypes)> call() {
+    return createArgumentVectorFromTypes<ReturnTypes...>(
+        std::make_index_sequence<sizeof...(ReturnTypes)>()
+    );
+  }
+};
+/// Single non-void, non-tuple return type: handled as a one-element tuple.
+template<class ReturnType>
+struct createReturns<ReturnType, std::enable_if_t<!std::is_same_v<void, ReturnType> && !guts::is_instantiation_of<std::tuple, ReturnType>::value>> final {
+  static constexpr std::array<ArgumentDef, 1> call() {
+    return createReturns<std::tuple<ReturnType>>::call();
+  }
+};
+/// void return type: handled as an empty tuple, i.e. no returns.
+template<>
+struct createReturns<void, void> final {
+  static constexpr std::array<ArgumentDef, 0> call() {
+    return createReturns<std::tuple<>>::call();
+  }
+};
+/// Creates exactly one `ArgumentDef` for `ReturnType`. Unlike `createReturns`,
+/// a `std::tuple` return type is not split into its elements here.
+template <typename ReturnType>
+struct createSingleReturn {
+  static constexpr std::array<ArgumentDef, 1> call() {
+    return createArgumentVectorFromTypes<ReturnType>(std::make_index_sequence<1>());
+  }
+};
+TORCH_API FunctionSchema make_function_schema(std::string&& name, std::string&& overload_name, c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns);
+TORCH_API FunctionSchema make_function_schema(c10::ArrayRef<ArgumentDef> arguments, c10::ArrayRef<ArgumentDef> returns);
+/// Builds a `FunctionSchema` from the `FunctionTraits` of a function.
+/// A std::tuple return type is flattened into multiple return types.
+template <typename FunctionTraits>
+FunctionSchema createFunctionSchemaFromTraitsFlattenedReturns() {
+ // Both tables are std::arrays evaluated at compile time and embedded into
+ // the binary; the only work left for runtime is turning them into the
+ // std::vectors held by the schema.
+ constexpr auto return_defs =
+     createReturns<typename FunctionTraits::return_type>::call();
+ constexpr auto argument_defs =
+     createArguments<typename FunctionTraits::parameter_types>::call();
+ return make_function_schema(argument_defs, return_defs);
+}
+/// Builds a named `FunctionSchema` from the `FunctionTraits` of a function.
+/// A std::tuple return type is preserved as one Tuple return type.
+template <typename FunctionTraits>
+FunctionSchema createFunctionSchemaFromTraitsSingleReturn(std::string&& name, std::string&& overload_name) {
+ // Both tables are std::arrays evaluated at compile time and embedded into
+ // the binary; the only work left for runtime is turning them into the
+ // std::vectors held by the schema.
+ constexpr auto return_defs =
+     createSingleReturn<typename FunctionTraits::return_type>::call();
+ constexpr auto argument_defs =
+     createArguments<typename FunctionTraits::parameter_types>::call();
+ return make_function_schema(std::move(name), std::move(overload_name), argument_defs, return_defs);
+}
+}
+/// Infers an unnamed `FunctionSchema` from the C++ function type `FuncType`;
+/// std::tuple returns are flattened into multiple returns.
+template<class FuncType>
+FunctionSchema inferFunctionSchemaFlattenedReturns() {
+  using traits = guts::infer_function_traits_t<FuncType>;
+  return detail::infer_schema::createFunctionSchemaFromTraitsFlattenedReturns<traits>();
+}
+/// Infers a `FunctionSchema` named `name` / `overload_name` from the C++
+/// function type `FuncType`; a std::tuple return stays a single return.
+template<class FuncType>
+FunctionSchema inferFunctionSchemaSingleReturn(std::string&& name, std::string&& overload_name) {
+  using traits = guts::infer_function_traits_t<FuncType>;
+  return detail::infer_schema::createFunctionSchemaFromTraitsSingleReturn<traits>(
+      std::move(name), std::move(overload_name));
+}
+TORCH_API std::optional<std::string> findSchemaDifferences(const FunctionSchema& inferred, const FunctionSchema& specified);
+}
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_allowlist.h ADDED Viewed

	@@ -0,0 +1,186 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// TODO: unify to C10_MOBILE. In theory this header could be used in OSS.
+#ifdef TEMPLATE_SELECTIVE_BUILD
+#include <ATen/selected_mobile_ops.h>
+#endif
+/**
+ * This header implements functionality to build PyTorch with only a certain
+ * set of operators (+ dependencies) included.
+ *
+ * - Build with -DTORCH_OPERATOR_WHITELIST="aten::add;aten::sub" and only these
+ *   two ops will be included in your build.  The allowlist records operators
+ *   only, no overloads; if you include aten::add, all overloads of aten::add
+ *   will be included.
+ *
+ * Internally, this is done by removing the operator registration calls
+ * using compile time programming, and the linker will then prune all
+ * operator functions that weren't registered.
+ * See Note [Selective build] for more details
+ *
+ * WARNING: The allowlist mechanism doesn't work for all ways you could go about
+ * registering an operator.  If the dispatch key / operator name is not
+ * sufficiently obvious at compile time, then the allowlisting mechanism
+ * will fail (and the operator will be included in the binary anyway).
+ */
+#include <string_view>
+#include <c10/core/DispatchKey.h>
+#include <c10/macros/Macros.h>
+#if defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE)
+#include <ATen/record_function.h>
+#endif
+namespace c10::impl {
+constexpr bool allowlist_contains(std::string_view allowlist, std::string_view item);  // Forward Declare
+/**
+ * Reports whether the build feature `name` is available.
+ *
+ * Selective build mode: the answer comes from TORCH_BUILD_FEATURE_ALLOWLIST
+ * when that macro is defined; without it every feature is available.
+ *
+ * Instrumenting mode (tracing mode): always true, with no side effects.
+ */
+constexpr bool is_build_feature_available(const char* name) {
+#if !defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE) && defined(TORCH_BUILD_FEATURE_ALLOWLIST)
+  // Selective Build mode with an explicit allowlist.
+  return allowlist_contains(
+    C10_STRINGIZE(TORCH_BUILD_FEATURE_ALLOWLIST),
+    name);
+#else
+  // Instrumenting mode, or Selective Build mode without an allowlist.
+  (void)name;
+  return true;
+#endif
+}
+[[noreturn]] void build_feature_required_feature_not_available(const char* feature);
+/**
+ * Use BUILD_FEATURE_REQUIRED macro in user-code.
+ *
+ * In selective build mode becomes a no-op if the build feature passed
+ * in is available. If not available, throws an exception (c10::Error).
+ * The compiler is able to perform dead code elimination for code
+ * following this method if the build feature is not available.
+ *
+ * In instrumenting mode (tracing mode), registers (as a side effect)
+ * the presence of this specific build feature being triggered.
+ */
+#if !defined(ENABLE_RECORD_KERNEL_FUNCTION_DTYPE)  // selective build mode
+#if defined(TORCH_BUILD_FEATURE_ALLOWLIST)
+// Both names are qualified from the global namespace so that the expansion
+// resolves to the same functions regardless of the namespace the macro is
+// used in.
+#define BUILD_FEATURE_REQUIRED(NAME)                                 \
+  if (!::c10::impl::is_build_feature_available(NAME)) {              \
+    ::c10::impl::build_feature_required_feature_not_available(NAME); \
+  }
+#else  // Everything trivially selected
+#define BUILD_FEATURE_REQUIRED(NAME)
+#endif
+#else  // trace mode
+// Records the feature name under the BUILD_FEATURE record scope.
+#define BUILD_FEATURE_REQUIRED(NAME)  \
+  RECORD_FUNCTION_WITH_SCOPE(         \
+      at::RecordScope::BUILD_FEATURE, \
+      std::string(NAME),              \
+      {});
+#endif
+// Use this macro, and not is_build_feature_available
+#define BUILD_FEATURE_AVAILABLE(NAME) ::c10::impl::is_build_feature_available(NAME)
+// Returns true iff `item` equals one of the ';'-separated entries of
+// `allowlist`.
+//   allowlist_contains("a;bc;d", "bc") == true
+// Entries are compared exactly (no trimming). An empty entry -- including the
+// single entry of an empty allowlist -- matches an empty `item`.
+constexpr bool allowlist_contains(std::string_view allowlist, std::string_view item) {
+  // `entry_begin` never exceeds allowlist.size(): it is either 0 or one past
+  // a ';' that was found inside the string, so substr() cannot go out of range.
+  std::string_view::size_type entry_begin = 0;
+  for (;;) {
+    const std::string_view::size_type separator = allowlist.find(';', entry_begin);
+    if (separator == std::string_view::npos) {
+      // Last entry: everything up to the end of the list.
+      return allowlist.substr(entry_begin) == item;
+    }
+    if (allowlist.substr(entry_begin, separator - entry_begin) == item) {
+      return true;
+    }
+    entry_begin = separator + 1;
+  }
+}
+// Decides whether the operator `op_name` should be registered, i.e. whether
+// it is on the operator allowlist. `op_name` must contain a "::" namespace
+// separator and must not contain '(' (both are asserted).
+constexpr bool op_allowlist_check(std::string_view op_name [[maybe_unused]]) {
+  // Use assert() instead of throw() due to a gcc bug. See:
+  // https://stackoverflow.com/questions/34280729/throw-in-constexpr-function
+  // https://github.com/fmtlib/fmt/issues/682
+  assert(op_name.find("::") != std::string_view::npos);
+  assert(op_name.find('(') == std::string_view::npos);
+#if defined(TORCH_OPERATOR_WHITELIST)
+  // This function is majorly used for mobile selective build with
+  // root operators, where the overload is included in the allowlist.
+  return allowlist_contains(
+    C10_STRINGIZE(TORCH_OPERATOR_WHITELIST),
+    op_name);
+    // // Strip overload name (as allowlist doesn't contain overloads)
+    // // Another function based on this may be added when there's usage
+    // // on op names without overload.
+    // OperatorNameView::parse(op_name).name);
+#else
+  // If the TORCH_OPERATOR_WHITELIST parameter is not defined,
+  // all ops are to be registered
+  return true;
+#endif
+}
+// Returns true iff the given schema string is on the allowlist
+// and should be registered. Only the part of `schema` before the first '('
+// (the whole string if there is none) is checked, via op_allowlist_check.
+// With TORCH_FORCE_SCHEMA_REGISTRATION defined every schema is accepted; the
+// parameter is unused in that configuration, hence [[maybe_unused]] (matching
+// the other *_allowlist_check functions).
+constexpr bool schema_allowlist_check(std::string_view schema [[maybe_unused]]) {
+#if defined(TORCH_FORCE_SCHEMA_REGISTRATION)
+  return true;
+#else
+  return op_allowlist_check(schema.substr(0, schema.find('(')));
+#endif
+}
+// Decides whether the custom class `custom_class_name` should be registered,
+// i.e. whether it is on the custom class allowlist.
+constexpr bool custom_class_allowlist_check(std::string_view custom_class_name [[maybe_unused]]) {
+#if defined(TORCH_CUSTOM_CLASS_ALLOWLIST)
+  return allowlist_contains(
+    C10_STRINGIZE(TORCH_CUSTOM_CLASS_ALLOWLIST),
+    custom_class_name);
+#else
+  // If the TORCH_CUSTOM_CLASS_ALLOWLIST parameter is not defined,
+  // all custom classes are to be registered
+  return true;
+#endif
+}
+// Same check as schema_allowlist_check(), but against a caller-supplied
+// `allowlist` instead of the TORCH_OPERATOR_WHITELIST macro. Only the part of
+// `schema` before the first '(' is looked up.
+constexpr bool op_allowlist_contains_name_in_schema(std::string_view allowlist, std::string_view schema) {
+  const std::string_view name_part = schema.substr(0, schema.find('('));
+  return allowlist_contains(allowlist, name_part);
+}
+} // namespace c10::impl
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/core/op_registration/op_registration.h ADDED Viewed

	@@ -0,0 +1,599 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+/**
+ * Include this file if you want to register operators. It includes all
+ * functionality needed to do so for you.
+ */
+#include <c10/core/DispatchKey.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/core/CompileTimeFunctionPointer.h>
+#include <ATen/core/boxing/KernelFunction.h>
+#include <ATen/core/dispatch/CppSignature.h>
+#include <ATen/core/dispatch/RegistrationHandleRAII.h>
+#include <ATen/core/op_registration/infer_schema.h>
+#if defined(EXPOSE_C2_OPS) || !defined(CAFFE2_IS_XPLAT_BUILD)
+#include <torch/csrc/jit/frontend/function_schema_parser.h>
+#endif
+#include <ATen/core/ATenOpList.h>
+namespace c10 {
+namespace detail {
+// Infers the FunctionSchema of `KernelFunctor`, returned as a heap-allocated
+// object. A leading DispatchKeySet parameter is removed from the functor's
+// signature first: every argument in a function schema is expected to be
+// convertible to an ivalue, but DispatchKeySet is not a type we want the jit
+// to be aware of.
+// See Note [Plumbing Keys Through The Dispatcher]
+template<class KernelFunctor>
+std::unique_ptr<FunctionSchema> inferFunctionSchemaFromFunctor() {
+  using stripped_func_type = typename c10::remove_DispatchKeySet_arg_from_func<KernelFunctor>::func_type;
+  auto inferred = inferFunctionSchemaFlattenedReturns<stripped_func_type>();
+  return std::make_unique<FunctionSchema>(std::move(inferred));
+}
+}
+/**
+ * An instance of this class handles the registration for one or more operators.
+ * Make sure you keep the RegisterOperators instance around since it will
+ * deregister the operator it's responsible for in its destructor.
+ *
+ * Example:
+ *
+ * > namespace {
+ * >   class my_kernel_cpu final : public c10::OperatorKernel {
+ * >   public:
+ * >     Tensor operator()(Tensor a, Tensor b) {...}
+ * >   };
+ * > }
+ * >
+ * > static auto registry = c10::RegisterOperators()
+ * >     .op(c10::RegisterOperators::options()
+ * >         .schema("my_op")
+ * >         .kernel<my_kernel_cpu>(DispatchKey::CPU));
+ */
+class TORCH_API RegisterOperators final {
+public:
+  RegisterOperators() = default;
+  ~RegisterOperators() = default;
+  RegisterOperators(const RegisterOperators&) = delete;
+  RegisterOperators& operator=(const RegisterOperators&) = delete;
+  RegisterOperators(RegisterOperators&&) noexcept = default;
+  RegisterOperators& operator=(RegisterOperators&&) noexcept = default;
+  class TORCH_API Options final {
+  public:
+    Options(const Options&) = delete;
+    Options(Options&&) noexcept = delete;
+    Options& operator=(const Options&) = delete;
+    Options& operator=(Options&&) noexcept = delete;
+    // internal-only for registering stack based kernels
+    // Wraps the boxed function `kernel_func` into a KernelFunction and
+    // forwards it, together with `dispatch_key`, to the four-argument
+    // kernel(...) overload; the last two arguments are passed as
+    // std::nullopt and nullptr.
+    template<KernelFunction::BoxedKernelFunction* kernel_func>
+    Options&& kernel(DispatchKey dispatch_key) && {
+      return std::move(*this).kernel(dispatch_key, KernelFunction::makeFromBoxedFunction<kernel_func>(), std::nullopt, nullptr);
+    }
+    // internal-only for registering stack based catch-all kernels
+    // Same as the boxed kernel<kernel_func>(dispatch_key) overload, except
+    // that std::nullopt is passed in place of a dispatch key.
+    template<KernelFunction::BoxedKernelFunction* kernel_func>
+    Options&& catchAllKernel() && {
+      return std::move(*this).kernel(std::nullopt, KernelFunction::makeFromBoxedFunction<kernel_func>(), std::nullopt, nullptr);
+    }
+    // internal only for registering caffe2 ops
+    // Stores an already-built FunctionSchema. Fails the TORCH_CHECK if a
+    // schema (or name) has already been set on this Options object.
+    Options&& schema(FunctionSchema&& schema) {
+        TORCH_CHECK(!schemaOrName_.has_value(), "You can only specify the schema once per operator registration.");
+        schemaOrName_ = FunctionSchema(std::move(schema));
+        return std::move(*this);
+    }
+    /**
+     * Use this to specify the schema for an operator. You can also specify
+     * the operator name only to have the function signature part of the
+     * schema be inferred from the kernel function.
+     *
+     * Example:
+     *
+     * > // Infer function signature from my_kernel_cpu
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel<my_kernel_cpu>(DispatchKey::CPU));
+     * >
+     * >
+     * > // Explicitly specify full schema
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op(Tensor a) -> Tensor")
+     * >         .kernel<my_kernel_cpu>(DispatchKey::CPU));
+     */
+    Options&& schema(const std::string& schemaOrName) {
+      // A schema (or name) may be set only once per Options object.
+      TORCH_CHECK(!schemaOrName_.has_value(), "Tried to register operator ", schemaOrName," but specified schema multiple times. You can only specify the schema once per operator registration.");
+      #if !defined(EXPOSE_C2_OPS) && defined(CAFFE2_IS_XPLAT_BUILD)
+        // The schema parser is not available in this configuration, so the
+        // string cannot be parsed.
+        throw std::logic_error("Tried to register operator " + schemaOrName + ". We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build.");
+      #else
+        // Accepts either a full schema string or just an operator name.
+        schemaOrName_ = torch::jit::parseSchemaOrName(schemaOrName);
+      #endif
+      return std::move(*this);
+    }
+    /**
+     * Use this to register an operator whose kernel is implemented as a functor.
+     * The kernel is only called for inputs matching the given dispatch key.
+     * You can register multiple kernels for different dispatch keys.
+     *
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel<my_kernel_cpu>(DispatchKey::CPU));
+     *
+     * The functor constructor can take arguments to configure the kernel.
+     * The arguments are defined in the kernel registration.
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     explicit my_kernel_cpu(std::string some_configuration, int a, bool b)
+     * >         : ... {...}
+     * >
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel<my_kernel_cpu>(DispatchKey::CPU, "some_configuration", 3, true));
+     */
+    template<class KernelFunctor, class... ConstructorParameters>
+    // enable_if: only enable it if KernelFunctor is actually a functor
+    std::enable_if_t<guts::is_functor<KernelFunctor>::value, Options&&> kernel(DispatchKey dispatch_key, ConstructorParameters&&... constructorParameters) && {
+      // Compile-time checks: the functor must derive from OperatorKernel and
+      // must be constructible from the forwarded constructor arguments.
+      static_assert(std::is_base_of_v<OperatorKernel, KernelFunctor>, "Tried to register a kernel functor using the kernel<Functor>() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+      static_assert(std::is_constructible_v<KernelFunctor, ConstructorParameters...>, "Wrong argument list for constructor of kernel functor. The arguments to kernel<Functor>(arguments...) must match one of the constructors of Functor.");
+      // Constructs the functor on the heap from the forwarded arguments and
+      // registers it together with its C++ signature and the schema inferred
+      // from the functor.
+      return std::move(*this).kernel(
+        dispatch_key,
+        KernelFunction::makeFromUnboxedFunctor<false, KernelFunctor>(std::make_unique<KernelFunctor>(std::forward<ConstructorParameters>(constructorParameters)...)),
+        impl::CppSignature::make<KernelFunctor>(),
+        detail::inferFunctionSchemaFromFunctor<KernelFunctor>()
+      );
+    }
+    /**
+     * Use this to register an operator whose kernel is implemented as a functor.
+     * The kernel is a catch-all kernel, meaning it's called independent from
+     * the input. Dispatch is disabled for this operator.
+     *
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel<my_kernel_cpu>());
+     *
+     * The functor constructor can take arguments to configure the kernel.
+     * The arguments are defined in the kernel registration.
+     * Example:
+     *
+     * > namespace {
+     * >   class my_kernel_cpu final : public c10::OperatorKernel {
+     * >   public:
+     * >     explicit my_kernel_cpu(std::string some_configuration, int a, bool b)
+     * >         : ... {...}
+     * >
+     * >     Tensor operator()(Tensor a, Tensor b) {...}
+     * >   };
+     * > }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel<my_kernel_cpu>("some_configuration", 3, true));
+     */
+    template<class KernelFunctor, class... ConstructorParameters>
+    // enable_if: this overload only participates when KernelFunctor is a functor type
+    std::enable_if_t<guts::is_functor<KernelFunctor>::value, Options&&> catchAllKernel(ConstructorParameters&&... constructorParameters) && {
+      static_assert(std::is_base_of_v<OperatorKernel, KernelFunctor>, "Tried to register a kernel functor using the kernel<Functor>() API, but it doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+      static_assert(std::is_constructible_v<KernelFunctor, ConstructorParameters...>, "Wrong argument list for constructor of kernel functor. The arguments to kernel<Functor>(arguments...) must match one of the constructors of Functor.");
+      // Construct the functor from the forwarded constructor arguments and wrap it.
+      auto ownedFunctor = std::make_unique<KernelFunctor>(std::forward<ConstructorParameters>(constructorParameters)...);
+      auto wrappedKernel = KernelFunction::makeFromUnboxedFunctor<false, KernelFunctor>(std::move(ownedFunctor));
+      auto cppSig = impl::CppSignature::make<KernelFunctor>();
+      auto schemaFromFunctor = detail::inferFunctionSchemaFromFunctor<KernelFunctor>();
+      // std::nullopt as the dispatch key marks this registration as catch-all.
+      return std::move(*this).kernel(std::nullopt, std::move(wrappedKernel), std::move(cppSig), std::move(schemaFromFunctor));
+    }
+    /**
+     * Use this to register an operator whose kernel is implemented by a function.
+     * The kernel is only called for inputs matching the given dispatch key.
+     * You can register multiple kernels for different dispatch keys.
+     *
+     * Example:
+     *
+     * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel<decltype(my_kernel_cpu), &my_kernel_cpu>(DispatchKey::CPU));
+     */
+    template<class FuncType, FuncType* kernel_func>
+    // enable_if: this overload only participates when FuncType is a function type
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> kernel(DispatchKey dispatch_key) && {
+      static_assert(!std::is_same_v<FuncType, KernelFunction::BoxedKernelFunction>, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr");
+      // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+      using WrappedFunctor = typename impl::WrapFunctionIntoFunctor<CompileTimeFunctionPointer<FuncType, kernel_func>>::type;
+      auto kernelFn = KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func));
+      auto signature = impl::CppSignature::make<FuncType>();
+      auto inferredSchema = detail::inferFunctionSchemaFromFunctor<WrappedFunctor>();
+      // Record the compile-time function pointer under the given dispatch key.
+      return std::move(*this).kernel(dispatch_key, std::move(kernelFn), std::move(signature), std::move(inferredSchema));
+    }
+    /**
+     * Use this to register an operator whose kernel is implemented by a function.
+     * The kernel is a catch-all kernel, meaning it's called independent from
+     * the input. Dispatch is disabled for this operator.
+     *
+     * Example:
+     *
+     * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} }
+     * >
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel<decltype(my_kernel_cpu), &my_kernel_cpu>());
+     */
+    template<class FuncType, FuncType* kernel_func>
+    // enable_if: this overload only participates when FuncType is a function type
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> catchAllKernel() && {
+      static_assert(!std::is_same_v<FuncType, KernelFunction::BoxedKernelFunction>, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      static_assert(kernel_func != nullptr, "Kernel function cannot be nullptr");
+      // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+      using FunctorForSchema = typename impl::WrapFunctionIntoFunctor<CompileTimeFunctionPointer<FuncType, kernel_func>>::type;
+      auto wrappedKernel = KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernel_func));
+      auto cppSig = impl::CppSignature::make<FuncType>();
+      auto schemaFromFunctor = detail::inferFunctionSchemaFromFunctor<FunctorForSchema>();
+      // std::nullopt as the dispatch key marks this registration as catch-all.
+      return std::move(*this).kernel(std::nullopt, std::move(wrappedKernel), std::move(cppSig), std::move(schemaFromFunctor));
+    }
+    /**
+     * Registers a kernel given as a runtime function pointer for the given
+     * dispatch key. kernel_func must not be nullptr (checked with
+     * TORCH_INTERNAL_ASSERT).
+     */
+    template<class FuncType>
+    // enable_if: this overload only participates when FuncType is a function type
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> kernel(DispatchKey dispatch_key, FuncType* kernel_func) && {
+      static_assert(!std::is_same_v<FuncType, KernelFunction::BoxedKernelFunction>, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr");
+      // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+      using RuntimeFunctor = impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>;
+      auto kernelFn = KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func);
+      auto signature = impl::CppSignature::make<FuncType>();
+      auto inferredSchema = detail::inferFunctionSchemaFromFunctor<RuntimeFunctor>();
+      return std::move(*this).kernel(dispatch_key, std::move(kernelFn), std::move(signature), std::move(inferredSchema));
+    }
+    /**
+     * Registers a kernel given as a runtime function pointer as a catch-all
+     * kernel (no dispatch key). kernel_func must not be nullptr (checked with
+     * TORCH_INTERNAL_ASSERT).
+     */
+    template<class FuncType>
+    // enable_if: this overload only participates when FuncType is a function type
+    std::enable_if_t<guts::is_function_type<FuncType>::value, Options&&> catchAllKernel(FuncType* kernel_func) && {
+      static_assert(!std::is_same_v<FuncType, KernelFunction::BoxedKernelFunction>, "Tried to register a stackbased (i.e. internal) kernel function using the public kernel<...>() API. Please either use the internal kernel(...) API or also implement the kernel function as defined by the public API.");
+      TORCH_INTERNAL_ASSERT(kernel_func != nullptr, "Kernel function cannot be nullptr");
+      // TODO Do schema inference without relying on WrapFunctionIntoFunctor
+      using FunctorForSchema = impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>;
+      auto wrappedKernel = KernelFunction::makeFromUnboxedRuntimeFunction(kernel_func);
+      auto cppSig = impl::CppSignature::make<FuncType>();
+      auto schemaFromFunctor = detail::inferFunctionSchemaFromFunctor<FunctorForSchema>();
+      return std::move(*this).kernel(std::nullopt, std::move(wrappedKernel), std::move(cppSig), std::move(schemaFromFunctor));
+    }
+    /**
+     * Use this to register an operator whose kernel is implemented as a lambda.
+     * The kernel is only called for inputs matching the given dispatch key.
+     * You can register multiple kernels for different dispatch keys.
+     *
+     * The lambda must be stateless, i.e. not have a capture. If your kernel
+     * needs to store some configuration parameters, write the kernel as a
+     * functor instead.
+     *
+     * Example:
+     *
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .kernel(DispatchKey::CPU, [] (Tensor a) -> Tensor {...}));
+     */
+    template<class Lambda>
+    // enable_if: only participates when Lambda is a functor (lambdas are functors)
+    // whose call signature is not the boxed kernel function signature
+    std::enable_if_t<
+        guts::is_functor<std::decay_t<Lambda>>::value
+        && !std::is_same_v<typename guts::infer_function_traits_t<std::decay_t<Lambda>>::func_type, KernelFunction::BoxedKernelFunction>,
+        Options&&> kernel(DispatchKey dispatch_key, Lambda&& functor) && {
+      static_assert(!std::is_base_of_v<OperatorKernel, std::decay_t<Lambda>>, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel<Functor>() API instead.");
+      // Stateful lambdas (lambdas with a capture) are rejected on purpose: a
+      // functor kernel with a cache gets a fresh cache instance per lookup from
+      // the dispatch table, whereas a captured lambda would be global and share
+      // its capture across all lookups. Rather than making users reason about
+      // that (and the thread-safety issues it causes), captures are forbidden.
+      static_assert(guts::is_stateless_lambda<std::decay_t<Lambda>>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel<Functor>() instead.");
+      // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+      using RuntimeFunctor = impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>;
+      auto kernelFn = KernelFunction::makeFromUnboxedLambda(std::forward<Lambda>(functor));
+      auto signature = impl::CppSignature::make<Lambda>();
+      auto inferredSchema = detail::inferFunctionSchemaFromFunctor<RuntimeFunctor>();
+      return std::move(*this).kernel(dispatch_key, std::move(kernelFn), std::move(signature), std::move(inferredSchema));
+    }
+    /**
+     * Use this to register an operator whose kernel is implemented as a lambda.
+     * The kernel is a catch-all kernel, meaning it's called independent from
+     * the input. Dispatch is disabled for this operator.
+     *
+     * The lambda must be stateless, i.e. not have a capture. If your kernel
+     * needs to store some configuration parameters, write the kernel as a
+     * functor instead.
+     *
+     * Example:
+     *
+     * > static auto registry = c10::RegisterOperators()
+     * >     .op(c10::RegisterOperators::options()
+     * >         .schema("my_op")
+     * >         .catchAllKernel([] (Tensor a) -> Tensor {...}));
+     */
+    template<class Lambda>
+    // enable_if: only participates when Lambda is a functor (lambdas are functors)
+    // whose call signature is not the boxed kernel function signature
+    std::enable_if_t<
+        guts::is_functor<std::decay_t<Lambda>>::value
+        && !std::is_same_v<typename guts::infer_function_traits_t<std::decay_t<Lambda>>::func_type, KernelFunction::BoxedKernelFunction>,
+        Options&&> catchAllKernel(Lambda&& lambda) && {
+      static_assert(!std::is_base_of_v<OperatorKernel, std::decay_t<Lambda>>, "The kernel(x) API for registering a kernel is only meant to be used with lambdas. Your kernel is a functor. Please use the kernel<Functor>() API instead.");
+      // Stateful lambdas (lambdas with a capture) are rejected on purpose: the
+      // capture would be global and shared between all kernel lookups, which is
+      // a likely source of unexpected race conditions. A kernel that really needs
+      // global state can keep regular global state in its .cpp file next to the
+      // kernel lambda.
+      static_assert(guts::is_stateless_lambda<std::decay_t<Lambda>>::value, "The kernel(x) API for registering a kernel only works for stateless lambdas (i.e. lambdas without captures). If you need a cache, please use the functor based API kernel<Functor>() instead.");
+      // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+      using FunctorForSchema = impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>;
+      auto wrappedKernel = KernelFunction::makeFromUnboxedLambda(std::forward<Lambda>(lambda));
+      auto cppSig = impl::CppSignature::make<Lambda>();
+      auto schemaFromFunctor = detail::inferFunctionSchemaFromFunctor<FunctorForSchema>();
+      // std::nullopt as the dispatch key marks this registration as catch-all.
+      return std::move(*this).kernel(std::nullopt, std::move(wrappedKernel), std::move(cppSig), std::move(schemaFromFunctor));
+    }
+    /**
+     * Stores the alias analysis kind for this operator registration.
+     * May be called at most once per Options object: a second call fails the
+     * TORCH_CHECK below. Returns this object as an rvalue for call chaining.
+     */
+    Options&& aliasAnalysis(AliasAnalysisKind aliasAnalysisKind) && {
+      TORCH_CHECK(!aliasAnalysisKind_.has_value(), "You can only call aliasAnalysis() once per operator registration.");
+      aliasAnalysisKind_ = aliasAnalysisKind;
+      return std::move(*this);
+    }
+  private:
+    // Common sink for all public kernel()/catchAllKernel() overloads: bundles the
+    // pieces into a KernelRegistrationConfig and appends it to `kernels`.
+    // A dispatch_key of std::nullopt is what the catch-all overloads pass.
+    Options&& kernel(std::optional<DispatchKey> dispatch_key, KernelFunction&& func, std::optional<impl::CppSignature> cpp_signature, std::unique_ptr<FunctionSchema>&& inferred_function_schema) && {
+      KernelRegistrationConfig entry;
+      entry.inferred_function_schema = std::move(inferred_function_schema);
+      entry.cpp_signature = cpp_signature;
+      entry.func = std::move(func);
+      entry.dispatch_key = dispatch_key;
+      kernels.push_back(std::move(entry));
+      return std::move(*this);
+    }
+    // Private default constructor: starts with no schema/name and no alias
+    // analysis kind set; `kernels` is default-initialized.
+    Options()
+    : schemaOrName_(std::nullopt)
+    , aliasAnalysisKind_(std::nullopt)
+    {}
+    // KernelRegistrationConfig bundles everything collected for a single kernel
+    // from the config parameters passed to a RegisterOperators::op() call.
+    struct KernelRegistrationConfig final {
+      // All members start out empty; see the default member initializers below.
+      KernelRegistrationConfig() {}
+      // Dispatch key the kernel is registered for; empty for catch-all kernels.
+      std::optional<DispatchKey> dispatch_key = std::nullopt;
+      // The kernel itself.
+      KernelFunction func;
+      // C++ signature of the kernel, when one was provided.
+      std::optional<impl::CppSignature> cpp_signature = std::nullopt;
+      // Function schema inferred from the kernel; null when none was provided.
+      std::unique_ptr<FunctionSchema> inferred_function_schema = nullptr;
+    };
+    std::optional<std::variant<OperatorName, FunctionSchema>> schemaOrName_;
+    std::vector<KernelRegistrationConfig> kernels;
+    std::optional<AliasAnalysisKind> aliasAnalysisKind_;
+    friend class RegisterOperators;
+    friend class Library;
+  };
+  /**
+   * Call this to get an instance of registration options, which
+   * can be passed to a call to RegisterOperators::op() to specify
+   * these options for the operator registration.
+   * See class doc comment for examples.
+   *
+   * The returned Options object is default-constructed, i.e. it has no
+   * schema/name and no alias analysis kind set yet.
+   */
+  static Options options() {
+    return {};
+  }
+  /**
+   * Call this to register an operator. See class doc comment for examples.
+   *
+   * Rvalue-qualified overload: hands `options` to checkSchemaAndRegisterOp_()
+   * and returns this object as an rvalue so further calls can be chained.
+   */
+  RegisterOperators&& op(Options&& options) && {
+    checkSchemaAndRegisterOp_(std::move(options));
+    return std::move(*this);
+  }
+  // Regular mutator version of the && version above: performs the same
+  // checkSchemaAndRegisterOp_() call on an lvalue object and returns an lvalue
+  // reference to it.
+  RegisterOperators& op(Options&& options) & {
+    checkSchemaAndRegisterOp_(std::move(options));
+    return *this;
+  }
+  /**
+   * Shorthand for RegisterOperators::op(Options): the operator schema (or
+   * name) is given as a separate string argument instead of being set on the
+   * options object by the caller.
+   * See class doc comment for examples.
+   */
+  RegisterOperators&& op(const std::string& schemaOrName, Options&& options = RegisterOperators::options()) && {
+    // Attach the schema/name to the options first, then register as usual.
+    auto&& optionsWithSchema = std::move(options).schema(schemaOrName);
+    return std::move(*this).op(std::move(optionsWithSchema));
+  }
+  // internal only for registering caffe2 ops
+  // Same as the string-based shorthand, but takes an already-built FunctionSchema.
+  RegisterOperators&& op(FunctionSchema schema, Options&& options) && {
+    auto&& optionsWithSchema = std::move(options).schema(std::move(schema));
+    return std::move(*this).op(std::move(optionsWithSchema));
+  }
+  // Convenience constructor: delegates to the default constructor and then
+  // registers a single operator by calling op(schemaOrName, func, options) on
+  // this object. The reference returned by op() is intentionally discarded.
+  template<class FuncType>
+  explicit RegisterOperators(const std::string& schemaOrName, FuncType&& func, Options&& options = RegisterOperators::options())
+  : RegisterOperators() {
+    std::move(*this).op(schemaOrName, std::forward<FuncType>(func), std::move(options));
+  }
+  /**
+   * This API registers an operator based on a kernel function pointer.
+   * The kernel is registered as a catch-all kernel (no dispatch key).
+   *
+   * Given a kernel
+   *
+   * > namespace { Tensor my_kernel_cpu(Tensor a, Tensor b) {...} }
+   *
+   * This API looks like:
+   *
+   * > static auto registry = c10::RegisterOperators()
+   * >     .op("my_op", &my_kernel_cpu);
+   *
+   * If your kernel is small and the overhead of calling it matters,
+   * then this API might be the wrong choice since the following API
+   * has a slightly lower overhead for calling into the kernel:
+   *
+   * > static auto registry = c10::RegisterOperators()
+   * >     .op("my_op", c10::RegisterOperators::options()
+   * >         .catchAllKernel<decltype(my_kernel_cpu), &my_kernel_cpu>());
+   *
+   * Or, alternatively, write your kernel as a functor:
+   *
+   * > namespace {
+   * >   class my_kernel_cpu final : public c10::OperatorKernel {
+   * >   public:
+   * >     Tensor operator()(Tensor a, Tensor b) {...}
+   * >   };
+   * > }
+   * >
+   * > static auto registry = c10::RegisterOperators()
+   * >     .op("my_op", c10::RegisterOperators::options()
+   * >         .catchAllKernel<my_kernel_cpu>());
+   */
+   template<class FuncType>
+   // enable_if: only enable it if FuncType is actually a function, but not a stack based BoxedKernelFunction.
+   std::enable_if_t<guts::is_function_type<FuncType>::value && !std::is_same_v<FuncType, KernelFunction::BoxedKernelFunction>, RegisterOperators&&>
+   op(const std::string& schemaOrName, FuncType* func, Options&& options = RegisterOperators::options()) && {
+     // This overload builds the kernel with AllowLegacyTypes enabled.
+     constexpr bool AllowLegacyTypes = true;
+     return std::move(*this).op(std::move(options).schema(schemaOrName).kernel(
+       std::nullopt,
+       KernelFunction::makeFromUnboxedRuntimeFunction<AllowLegacyTypes>(func),
+       impl::CppSignature::make<FuncType>(),
+       // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+       detail::inferFunctionSchemaFromFunctor<impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>()
+     ));
+   }
+   /**
+    * This API registers an operator based on a kernel lambda.
+    *
+    * This API looks like:
+    *
+    * > static auto registry = c10::RegisterOperators()
+    * >     .op("my_op", [] (Tensor a, Tensor b) {...});
+    *
+    * This is equivalent to:
+    *
+    * > static auto registry = c10::RegisterOperators()
+    * >     .op("my_op", c10::RegisterOperators::options()
+    * >         .catchAllKernel([] (Tensor a, Tensor b) {...}));
+    *
+    */
+    template<class Lambda>
+    // enable_if: only participates when Lambda is a functor and a stateless lambda
+    std::enable_if_t<guts::is_functor<Lambda>::value && guts::is_stateless_lambda<std::decay_t<Lambda>>::value, RegisterOperators&&>
+    op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && {
+      static_assert(!std::is_base_of_v<OperatorKernel, Lambda>, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead.");
+      constexpr bool AllowLegacyTypes = true;
+      // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+      using RuntimeFunctor = impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>;
+      // Attach the schema/name first, then build the catch-all kernel pieces.
+      auto&& optionsWithSchema = std::move(options).schema(schemaOrName);
+      auto kernelFn = KernelFunction::makeFromUnboxedLambda<AllowLegacyTypes>(std::forward<Lambda>(lambda));
+      auto signature = impl::CppSignature::make<Lambda>();
+      auto inferredSchema = detail::inferFunctionSchemaFromFunctor<RuntimeFunctor>();
+      return std::move(*this).op(std::move(optionsWithSchema).kernel(
+        std::nullopt, std::move(kernelFn), std::move(signature), std::move(inferredSchema)));
+    }
+    template<class Lambda>
+    C10_DEPRECATED_MESSAGE("Registering operator kernels with stateful lambdas (i.e. lambdas with a capture) has non-obvious behavior. This is deprecated. Please use a lambda without a capture or a functor class instead.")
+    // enable_if: only participates when Lambda is a functor that is NOT a stateless lambda
+    std::enable_if_t<guts::is_functor<Lambda>::value && !guts::is_stateless_lambda<std::decay_t<Lambda>>::value, RegisterOperators&&>
+    op(const std::string& schemaOrName, Lambda&& lambda, Options&& options = RegisterOperators::options()) && {
+      static_assert(!std::is_base_of_v<OperatorKernel, Lambda>, "c10::OperatorKernel is part of the new kernel registration API and shouldn't be used together with the deprecated registration API. Please use the new RegisterOperators::options().kernel() based API instead.");
+      constexpr bool AllowLegacyTypes = true;
+      // TODO Do schema inference without relying on WrapFunctionIntoRuntimeFunctor
+      using FunctorForSchema = impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>;
+      // Attach the schema/name first, then build the catch-all kernel pieces.
+      auto&& namedOptions = std::move(options).schema(schemaOrName);
+      auto wrappedKernel = KernelFunction::makeFromUnboxedLambda<AllowLegacyTypes>(std::forward<Lambda>(lambda));
+      auto cppSig = impl::CppSignature::make<Lambda>();
+      auto schemaFromFunctor = detail::inferFunctionSchemaFromFunctor<FunctorForSchema>();
+      return std::move(*this).op(std::move(namedOptions).kernel(
+        std::nullopt, std::move(wrappedKernel), std::move(cppSig), std::move(schemaFromFunctor)));
+    }
+private:
+  void checkSchemaAndRegisterOp_(Options&& config);
+  static c10::FunctionSchema inferSchemaFromKernels_(const OperatorName& opNameStr, const Options& options);
+  void checkNoDuplicateKernels_(const Options& options);
+  void registerOp_(Options&& options);
+  std::vector<RegistrationHandleRAII> registrars_;
+};
+} // namespace c10
+namespace torch {
+  // Old-style API: exposes c10::RegisterOperators under the name
+  // torch::RegisterOperators as a plain type alias.
+  using RegisterOperators = c10::RegisterOperators;
+}
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/FlushDenormal.h ADDED Viewed

	@@ -0,0 +1,19 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/// Flush-To-Zero and Denormals-Are-Zero mode
+///
+/// Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) are modes that bypass
+/// IEEE 754 methods of dealing with denormal floating-point numbers on x86-64
+/// and some x86 CPUs. They result in reduced precision for values near zero,
+/// but increased performance.
+///
+/// See https://software.intel.com/en-us/articles/x87-and-sse-floating-point-assists-in-ia-32-flush-to-zero-ftz-and-denormals-are-zero-daz
+namespace at::cpu {
+// Presumably switches the FTZ/DAZ modes described at the top of this header
+// on (`on == true`) or off — confirm against the implementation.
+// NOTE(review): the meaning of the bool return value is not visible from this
+// declaration; it likely reports whether the mode could be applied — confirm.
+bool set_flush_denormal(bool on);
+}  // namespace at::cpu
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/Utils.h ADDED Viewed

	@@ -0,0 +1,38 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <cstdint>
+#include <c10/macros/Export.h>
+namespace at::cpu {
+// Detect if the CPU supports AVX2.
+TORCH_API bool is_avx2_supported();
+// Detect if the CPU supports AVX512.
+TORCH_API bool is_avx512_supported();
+// Detect if the CPU supports the Vector Neural Network Instructions.
+TORCH_API bool is_avx512_vnni_supported();
+// Detect if the CPU supports the AVX512_BF16 ISA.
+TORCH_API bool is_avx512_bf16_supported();
+// Detect if the CPU supports the Advanced Matrix Extension.
+TORCH_API bool is_amx_tile_supported();
+// Detect if the CPU supports the Advanced Matrix Extension for fp16.
+TORCH_API bool is_amx_fp16_supported();
+// Enable the system to use AMX instructions.
+TORCH_API bool init_amx();
+// Get the L1 data cache size per core, in bytes.
+TORCH_API uint32_t L1d_cache_size();
+// Get the L2 cache size per core, in bytes.
+TORCH_API uint32_t L2_cache_size();
+} // namespace at::cpu
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpu/vml.h ADDED Viewed

	@@ -0,0 +1,175 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/Config.h>
+#include <ATen/Parallel.h>
+#include <ATen/OpMathType.h>
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/cpu/vec/vec.h>
+#include <c10/util/complex.h>
+// This header implements various unary operations using a MKL VML style
+// interface.
+// It implements various functions with a simple interface
+// For example, it enables the user to call vsin(float* out, const float* in,
+// size). This function takes a pointer to a contiguous output array of floats and
+// a constant input array. It will then apply sin to each value in the input
+// array and write the result into the output array. out and in may point to the
+// same memory, i.e. this fully supports in-place operations. These functions
+// also implement their own parallelization, so take precautions when calling
+// these from threaded functions.
+// When MKL is available it will call into MKL's VML library similar to NumPy
+// If MKL is not available it will use SLEEF.
+// This file might be compiled under AVX or AVX2 when called from e.g.
+// UnaryOpsKernel.cpp
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+#if AT_MKL_ENABLED() && !defined(__APPLE__)
+#include <mkl.h>
+#endif
+namespace at::vml {
+inline namespace CPU_CAPABILITY {
+using namespace vec;
+// Writes 1 / sqrt(in[i]) to out[i] for `size` elements, splitting the range
+// across parallel_for tasks with a grain size of 2048.
+template <typename scalar_t>
+inline void vrsqrt(scalar_t* out, scalar_t* in, int64_t size) {
+  // Per-vector kernel: reciprocal of the square root.
+  const auto inverse_sqrt = [](const Vectorized<scalar_t>& v) {
+    return Vectorized<scalar_t>(static_cast<scalar_t>(1)) / v.sqrt();
+  };
+  parallel_for(0, size, 2048, [out, in, inverse_sqrt](int64_t begin, int64_t end) {
+    const int64_t count = end - begin;
+    map(inverse_sqrt, out + begin, in + begin, count);
+  });
+}
+// NB: We ignore numerical errors by convention and leave them to the user
+//
+// IMPLEMENT_VML(op) defines a function template `v<op>(out, in, size)` that
+// applies the Vectorized member function `op()` to `size` elements read from
+// `in` and writes the results to `out`, using vec::map.
+// The last line of the macro body deliberately carries no trailing backslash:
+// a trailing line continuation would splice whatever line follows into the
+// macro body and silently swallow it.
+#define IMPLEMENT_VML(op)                                               \
+  template <typename scalar_t>                                          \
+  inline void v##op(scalar_t* out, const scalar_t* in, int64_t size) {  \
+    using vec_t = Vectorized<vec_scalar_t<scalar_t>>;                   \
+    vec::map([](vec_t x) { return x.op(); }, out, in, size);            \
+  }
+IMPLEMENT_VML(abs)
+IMPLEMENT_VML(acos)
+IMPLEMENT_VML(asin)
+IMPLEMENT_VML(atan)
+IMPLEMENT_VML(atanh)
+IMPLEMENT_VML(ceil)
+IMPLEMENT_VML(cos)
+// IMPLEMENT_VML(cosh)
+IMPLEMENT_VML(erf)
+IMPLEMENT_VML(erfc)
+IMPLEMENT_VML(erfinv)
+IMPLEMENT_VML(exp)
+IMPLEMENT_VML(expm1)
+IMPLEMENT_VML(floor)
+IMPLEMENT_VML(i0)
+IMPLEMENT_VML(i0e)
+IMPLEMENT_VML(digamma)
+IMPLEMENT_VML(reciprocal)
+IMPLEMENT_VML(log)
+IMPLEMENT_VML(log10)
+IMPLEMENT_VML(log1p)
+IMPLEMENT_VML(log2)
+IMPLEMENT_VML(neg)
+IMPLEMENT_VML(sin)
+// IMPLEMENT_VML(sinh)
+IMPLEMENT_VML(sqrt)
+IMPLEMENT_VML(round)
+IMPLEMENT_VML(rsqrt)
+IMPLEMENT_VML(tan)
+IMPLEMENT_VML(tanh)
+IMPLEMENT_VML(trunc)
+IMPLEMENT_VML(lgamma)
+#if AT_MKL_ENABLED() && !defined(__APPLE__)
+// NB: LP64 MKL is the most commonly used and thus we assume it here. That means
+// we need to expect MKL_INT to be of type int, which implies int32_t or int64_t in most
+// cases.
+// The assertion below turns a violation of that assumption into a compile error.
+static_assert(
+    std::is_same_v<MKL_INT, int32_t> || std::is_same_v<MKL_INT, int64_t>,
+    "MKL_INT is assumed to be int32_t or int64_t");
+// IMPLEMENT_VML_MKL_STUB(op, mklop, type, mkltype) defines an explicit
+// specialization of v<op> for `type` that calls the MKL routine
+// vm<mkltype><mklop> with the mode flags VML_HA | VML_FTZDAZ_OFF |
+// VML_ERRMODE_IGNORE.
+// If `size` fits into MKL_INT, a single call is made. Otherwise the input is
+// processed in chunks of std::numeric_limits<MKL_INT>::max() elements,
+// followed by one final call for the remaining `size % max` elements.
+// NOTE(review): std::numeric_limits is used here but <limits> is not among the
+// includes visible in this header — presumably it arrives transitively; confirm.
+#define IMPLEMENT_VML_MKL_STUB(op, mklop, type, mkltype)                \
+  template <>                                                           \
+  inline void v##op(type * out, const type * in, int64_t size) {        \
+    auto constexpr max_mkl_ind = std::numeric_limits<MKL_INT>::max();   \
+    if (size <= static_cast<int64_t>(max_mkl_ind)) {                    \
+      vm##mkltype##mklop(                                               \
+          size, in, out, VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE); \
+    } else {                                                            \
+      int64_t ind = 0;                                                  \
+      int64_t chunks = size / max_mkl_ind;                              \
+      int64_t rest = size % max_mkl_ind;                                \
+      for (; ind < chunks; ind++) {                                     \
+        vm##mkltype##mklop(                                             \
+            max_mkl_ind,                                                \
+            in + ind * max_mkl_ind,                                     \
+            out + ind * max_mkl_ind,                                    \
+            VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE);              \
+      }                                                                 \
+      vm##mkltype##mklop(                                               \
+          rest,                                                         \
+          in + ind * max_mkl_ind,                                       \
+          out + ind * max_mkl_ind,                                      \
+          VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE);                \
+    }                                                                   \
+  }
+// IMPLEMENT_VML_MKL(op, mklop) emits the MKL-backed specializations of v<op>
+// for both float (MKL prefix `s`) and double (MKL prefix `d`).
+#define IMPLEMENT_VML_MKL(op, mklop)          \
+  IMPLEMENT_VML_MKL_STUB(op, mklop, float, s) \
+  IMPLEMENT_VML_MKL_STUB(op, mklop, double, d)
+// NB: abs, cosh and sinh were temporarily disabled due to issues with Apple
+// NB: expm1 is disabled because on some configs it produces expm1(nan)=-1
+IMPLEMENT_VML_MKL(acos, Acos)
+IMPLEMENT_VML_MKL(asin, Asin)
+IMPLEMENT_VML_MKL(atan, Atan)
+IMPLEMENT_VML_MKL(cos, Cos)
+// IMPLEMENT_VML_MKL(cosh, Cosh)
+IMPLEMENT_VML_MKL(erf, Erf)
+IMPLEMENT_VML_MKL(erfc, Erfc)
+IMPLEMENT_VML_MKL(erfinv, ErfInv)
+IMPLEMENT_VML_MKL(exp, Exp)
+// IMPLEMENT_VML_MKL(expm1, Expm1)
+IMPLEMENT_VML_MKL(log, Ln)
+IMPLEMENT_VML_MKL(log10, Log10)
+IMPLEMENT_VML_MKL(sin, Sin)
+// IMPLEMENT_VML_MKL(sinh, Sinh)
+IMPLEMENT_VML_MKL(sqrt, Sqrt)
+IMPLEMENT_VML_MKL(tan, Tan)
+IMPLEMENT_VML_MKL(tanh, Tanh)
+IMPLEMENT_VML_MKL(trunc, Trunc)
+// Not vectorized in MKL version tested
+// IMPLEMENT_VML_MKL(abs, Abs)
+// IMPLEMENT_VML_MKL(log1p, Log1p)
+#if INTEL_MKL_VERSION >= 20180406
+IMPLEMENT_VML_MKL(log2, Log2)
+#endif
+#endif
+} // namespace
+} // namespace at::vml
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/BLASConstants.h ADDED Viewed

	@@ -0,0 +1,16 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/TensorBase.h>
+namespace at::cuda::detail {
+// Accessors returning float pointers used as BLAS scalar arguments.
+// NOTE(review): judging by the names, the first two presumably point at
+// constants 1.0f and 0.0f in device memory, and the third at storage for a
+// user-supplied alpha — ownership, lifetime and memory space are not visible
+// from these declarations; confirm against the implementation.
+float *get_cublas_device_one();
+float *get_cublas_device_zero();
+float *get_user_alpha_ptr();
+} // namespace at::cuda::detail
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h ADDED Viewed

	@@ -0,0 +1,76 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/detail/CUDAHooksInterface.h>
+#include <ATen/Generator.h>
+// TODO: No need to have this whole header, we can just put it all in
+// the cpp file
+namespace at::cuda::detail {
+// Set the callback to initialize Magma, which is set by
+// torch_cuda_cu. This indirection is required so magma_init is called
+// in the same library where Magma will be used.
+TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)());
+// The real implementation of CUDAHooksInterface.
+// Every member function below overrides a virtual of at::CUDAHooksInterface.
+// Only isBuilt() and isAvailable() are defined inline in this header; all
+// other overrides are merely declared here.
+struct CUDAHooks : public at::CUDAHooksInterface {
+  // The hooks-args object is accepted but not used.
+  CUDAHooks(at::CUDAHooksArgs /*unused*/) {}
+  void init() const override;
+  Device getDeviceFromPtr(void* data) const override;
+  bool isPinnedPtr(const void* data) const override;
+  // device_index defaults to -1 for both generator accessors.
+  // NOTE(review): -1 presumably selects the current device — confirm.
+  const Generator& getDefaultGenerator(
+      DeviceIndex device_index = -1) const override;
+  Generator getNewGenerator(
+      DeviceIndex device_index = -1) const override;
+  bool hasCUDA() const override;
+  bool hasMAGMA() const override;
+  bool hasCuDNN() const override;
+  bool hasCuSOLVER() const override;
+  bool hasCuBLASLt() const override;
+  bool hasROCM() const override;
+  bool hasCKSDPA() const override;
+  bool hasCKGEMM() const override;
+  const at::cuda::NVRTC& nvrtc() const override;
+  DeviceIndex current_device() const override;
+  // This implementation always reports itself as built.
+  bool isBuilt() const override {return true;}
+  // Availability is defined as whatever hasCUDA() reports.
+  bool isAvailable() const override {return hasCUDA();}
+  bool hasPrimaryContext(DeviceIndex device_index) const override;
+  Allocator* getCUDADeviceAllocator() const override;
+  Allocator* getPinnedMemoryAllocator() const override;
+  bool compiledWithCuDNN() const override;
+  bool compiledWithMIOpen() const override;
+  bool supportsDilatedConvolutionWithCuDNN() const override;
+  bool supportsDepthwiseConvolutionWithCuDNN() const override;
+  bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
+  bool supportsBFloat16RNNWithCuDNN() const override;
+  bool hasCUDART() const override;
+  long versionCUDART() const override;
+  long versionCuDNN() const override;
+  long versionRuntimeCuDNN() const override;
+  long versionCuDNNFrontend() const override;
+  long versionMIOpen() const override;
+  std::string showConfig() const override;
+  double batchnormMinEpsilonCuDNN() const override;
+  int64_t cuFFTGetPlanCacheMaxSize(DeviceIndex device_index) const override;
+  void cuFFTSetPlanCacheMaxSize(DeviceIndex device_index, int64_t max_size) const override;
+  int64_t cuFFTGetPlanCacheSize(DeviceIndex device_index) const override;
+  void cuFFTClearPlanCache(DeviceIndex device_index) const override;
+  int getNumGPUs() const override;
+  DeviceIndex deviceCount() const override;
+  DeviceIndex getCurrentDevice() const override;
+  // isGPUArch is only declared when the build defines USE_ROCM.
+#ifdef USE_ROCM
+  bool isGPUArch(const std::vector<std::string>& archs, DeviceIndex device_index = -1) const override;
+#endif
+  void deviceSynchronize(DeviceIndex device_index) const override;
+};
+} // at::cuda::detail
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h ADDED Viewed

	@@ -0,0 +1,156 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Some stateful GPU libraries, such as cuDNN and cuBLAS, use handles to store state.
+// These handles are tied to a device, and these libraries require/recommend not
+// sharing handles across host threads.
+//
+// These libraries recommend using one handle per host thread. We may not want to do
+// this because threads are relatively light-weight, but creating and destroying
+// handles is expensive (destroying the handle causes synchronizations). DataParallel,
+// for example, creates new threads for each forward pass.
+//
+// This file implements a handle pool mechanism. The handle pool returns handles on
+// demand as threads request them. If all existing handles in the pool are in use,
+// it creates a new one. As threads terminate, they release handles back into the pool.
+// In this way, the handle pool never creates more handles than the high-water mark of
+// active threads, so it's efficient with DataParallel.
+#pragma once
+#include <unordered_map>
+#include <vector>
+#include <utility>
+#include <mutex>
+#include <memory>
+#include <c10/util/Exception.h>
+namespace at::cuda { namespace {
+// Pool of library handles of type Handle_t, keyed by device index.
+//   Create  - called with a pointer to the handle slot to initialise a handle.
+//   Destroy - called with the handle value to tear it down.
+// The pool must be owned by a std::shared_ptr, because newPoolWindow() relies
+// on shared_from_this().
+template <typename Handle_t, void Create(Handle_t *), void Destroy(Handle_t)>
+struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThreadHandlePool<Handle_t, Create, Destroy>> {
+    // Move-only owner of a single handle.
+    //
+    // std::vector::emplace_back()/push_back() may route through temporaries and
+    // invoke move constructors on the way.  The destructors of those
+    // temporaries must not call Destroy on a handle that is still in use, so
+    // ownership always travels to the most recently constructed object and the
+    // moved-from object is left holding nullptr.  This is deliberately lighter
+    // than reference counting; wrapping the stored Handles in unique_ptrs
+    // (unordered_map<int, vector<unique_ptr<Handle>>>) would be an alternative.
+    struct Handle {
+        Handle_t handle;
+
+        // Starts out empty; only calls Create when `create` is true.
+        Handle(bool create = false) : handle(nullptr)
+        {
+            if(create) {
+                Create(&handle);
+            }
+        }
+
+        Handle(const Handle& rhs) = delete;
+
+        // Steals the handle and leaves `rhs` empty.
+        Handle(Handle&& rhs) noexcept : handle(std::exchange(rhs.handle, nullptr)) {}
+
+        // Takes its argument by value (copy-and-swap idiom,
+        // https://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom):
+        // the previous handle ends up in `rhs` and is destroyed with it.
+        Handle& operator=(Handle rhs) {
+            std::swap(handle, rhs.handle);
+            return *this;
+        }
+
+        // Only a non-null handle is passed to Destroy.
+        ~Handle() {
+            if(handle) {
+                Destroy(handle);
+            }
+        }
+    };
+
+    // Guards created_handles and available_handles.
+    std::mutex mutex;
+
+    // Handles are created lazily, the first time a thread asks for one and no
+    // idle handle exists, and they live until the pool itself is destroyed.
+    // Per device, the number of handles therefore never exceeds the high-water
+    // mark of threads that concurrently requested a handle for that device.
+    // Threads hand their handles back on exit so that workloads which keep
+    // spawning fresh threads (DataParallel creates a new set of threads for each
+    // forward pass) reuse handles instead of paying for creation every time.
+    //
+    // The number of handles per device is intentionally NOT capped, to avoid
+    // deadlocks.  Example: with a cap of 4 and 5 threads sharing a device, only
+    // 4 threads can make progress at a time.  Those 4 keep their handles until
+    // they exit, so the fifth has to wait for one of them.  That alone is
+    // harmless - unless all 5 threads synchronize with each other at some
+    // intermediate point, before any of them has exited.  We cannot anticipate
+    // or forbid such synchronization in user threads, so the only safe choice
+    // is to impose no cap.
+    std::unordered_map<int, std::vector<Handle>> created_handles;
+    std::unordered_map<int, std::vector<Handle_t>> available_handles;
+
+    // Per-thread view onto the pool.  It lazily acquires and then caches the
+    // handles its thread uses, so the common-case lookup needs neither handle
+    // creation nor the pool mutex.
+    class PoolWindow
+    {
+    public:
+        PoolWindow(std::shared_ptr<DeviceThreadHandlePool> parent): weak_parent(std::move(parent)) {}
+        ~PoolWindow(){ release(); }
+
+        // Returns this window's handle for `device`, acquiring one on first use.
+        Handle_t reserve(int device)
+        {
+            // Fast path: this window already holds a handle for the device.
+            const auto cached = my_handles.find(device);
+            if(cached != my_handles.end()) {
+                return cached->second;
+            }
+
+            // Slow path: take an idle handle from the pool, or create a new one.
+            auto parent = weak_parent.lock();
+            TORCH_CHECK(parent, "Cannot create handle during program termination");
+            std::lock_guard<std::mutex> guard(parent->mutex);
+
+            auto& idle = parent->available_handles[device];
+            if(idle.empty())
+            {
+                // emplace_back has been observed to route through temporaries
+                // (move-constructor and destructor calls); Handle's move
+                // semantics above make that safe.
+                auto& owned = parent->created_handles[device];
+                owned.emplace_back(true /*create*/);
+                my_handles[device] = owned.back().handle;
+            }
+            else
+            {
+                my_handles[device] = idle.back();
+                idle.pop_back();
+            }
+            return my_handles[device];
+        }
+
+    private:
+        // Handles currently held by this window, keyed by device.
+        std::unordered_map<int, Handle_t> my_handles;
+        std::weak_ptr<DeviceThreadHandlePool> weak_parent;
+
+        // Called by the destructor: returns every held handle to the pool.
+        void release() {
+            if(my_handles.empty()) {
+                return;
+            }
+            auto parent = weak_parent.lock();
+            if(!parent) {
+                // The pool is already gone.  If this thread exits after atexit
+                // handlers have completed, the cuda context itself may be
+                // invalid, so the handles have to be leaked.
+                return;
+            }
+            std::lock_guard<std::mutex> guard(parent->mutex);
+            for(const auto& held : my_handles) {
+                parent->available_handles[held.first].push_back(held.second);
+            }
+        }
+    };
+
+    // Warning:
+    // This function is called from multiple threads without any mutex guarding
+    // the call, so any change to it must remain thread-safe.
+    PoolWindow *newPoolWindow() {
+        // The returned pointer will be owned by a thread local variable, so
+        // that different threads do not share the same PoolWindow.
+        return new PoolWindow(this->shared_from_this());
+    }
+};
+}}  // namespace at::cuda::<anonymous>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh ADDED Viewed

	@@ -0,0 +1,41 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/TensorBase.h>
+#include <ATen/cuda/detail/TensorInfo.cuh>
+#include <ATen/native/CanUse32BitIndexMath.h>
+namespace at::cuda::detail {
+TORCH_CUDA_CU_API bool maybeOverlappingIndices(const at::TensorBase &t);
+using at::native::canUse32BitIndexMath;
+// Builds a TensorInfo<scalar, IndexType> describing `t`: copies each
+// dimension's size and stride (converted to IndexType) into fixed-size buffers
+// and pairs them with the tensor's data pointer.
+//
+// The data pointer is obtained through const_data_ptr() when `scalar` is
+// const-qualified and through mutable_data_ptr() otherwise.
+//
+// Raises (via TORCH_CHECK) if the tensor has more dimensions than the
+// fixed-size buffers can hold.
+template <typename scalar, typename IndexType>
+TensorInfo<scalar, IndexType>
+getTensorInfo(const at::TensorBase &t) {
+  const int dims = t.dim();
+  // Validate the rank BEFORE touching the stack buffers below.  The TensorInfo
+  // constructor performs its own rank check, but it only runs after sz/st have
+  // been filled, which would already have written out of bounds.
+  TORCH_CHECK(dims <= MAX_TENSORINFO_DIMS,
+      "CUDA Tensors cannot have more than ", MAX_TENSORINFO_DIMS, " dimensions");
+  IndexType sz[MAX_TENSORINFO_DIMS];
+  IndexType st[MAX_TENSORINFO_DIMS];
+  for (int i = 0; i < dims; ++i) {
+    sz[i] = t.size(i);
+    st[i] = t.stride(i);
+  }
+  scalar* data_ptr = nullptr;
+  if constexpr (std::is_const_v<scalar>) {
+    data_ptr = t.const_data_ptr<scalar>();
+  } else {
+    data_ptr = t.mutable_data_ptr<scalar>();
+  }
+  return TensorInfo<scalar, IndexType>(
+    data_ptr, dims, sz, st);
+}
+} // namespace at::cuda::detail
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh ADDED Viewed

	@@ -0,0 +1,129 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <assert.h>
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+#include <cuda_runtime.h>
+#endif
+namespace at::cuda::detail {
+// A utility class to implement integer division by multiplication, given a fixed
+// divisor.
+//
+// WARNING: The fast divider algorithm is only implemented for unsigned int;
+//          otherwise we default to plain integer division.  For unsigned int,
+//          we further assume that the dividend is at most INT32_MAX.  Thus,
+//          IntDivider must NOT be used for general integer division.
+//
+//          This reduced range is enough for our purpose, and it allows us to
+//          slightly simplify the computation.
+//
+// (NOTE: Below, "2^k" denotes exponentiation, i.e., 1<<k.)
+//
+// For any N-bit unsigned integer d (> 0), we can find a "magic number" m (2^N
+// <= m < 2^(N+1)) and shift s such that:
+//
+//    \floor(n / d) = \floor((m * n) / 2^(N+s)).
+//
+// Given such m and s, the integer division can be then implemented as:
+//
+//    let m' = m - 2^N  // 0 <= m' < 2^N
+//
+//    fast_integer_division(n):
+//      // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned
+//      // integer.  Then take the higher N bits.
+//      t = (m' * n) >> N
+//
+//      // Here we use the fact that n is less than 2^(N-1): otherwise the value
+//      // of (t + n) may not fit in an N-bit integer.
+//      return (t + n) >> s
+//
+// Finding such a magic number is surprisingly easy:
+//
+//    s  = \ceil(\log_2 d)
+//    m' = \floor(2^N * (2^s - d) / d) + 1  // Need 2N-bit integer arithmetic.
+//
+// See also:
+//    - Division by Invariant Integers Using Multiplication,
+//      Torbjörn Granlund and Peter L. Montgomery, 1994.
+//
+//    - http://www.hackersdelight.org/magic.htm
+//
+//    - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
+// Quotient (`div`) and remainder (`mod`) of one division, returned together.
+template <typename Value>
+struct DivMod {
+  Value div;
+  Value mod;
+  C10_HOST_DEVICE DivMod(Value quotient, Value remainder)
+      : div(quotient), mod(remainder) {}
+};
+// Generic fallback: a fast path currently exists only for unsigned int, so
+// every other Value type simply uses the built-in / and % operators.
+template <typename Value>
+struct IntDivider {
+  // Leaves `divisor` uninitialized.
+  IntDivider() = default;
+  IntDivider(Value d) : divisor(d) { }
+
+  // n / divisor
+  C10_HOST_DEVICE inline Value div(Value n) const {
+    return n / divisor;
+  }
+
+  // n % divisor
+  C10_HOST_DEVICE inline Value mod(Value n) const {
+    return n % divisor;
+  }
+
+  // Quotient and remainder of n by divisor in one result object.
+  C10_HOST_DEVICE inline DivMod<Value> divmod(Value n) const {
+    const Value quotient = n / divisor;
+    const Value remainder = n % divisor;
+    return DivMod<Value>(quotient, remainder);
+  }
+
+  Value divisor;
+};
+// Implement fast integer division.
+// Specialization for unsigned int: the constructor precomputes a shift and a
+// 32-bit magic multiplier so that div() needs only a multiply-high, an add and
+// a shift.  The derivation is given in the comment block that precedes DivMod
+// in this namespace (N = 32 here).
+template <>
+struct IntDivider<unsigned int> {
+  static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int.");
+  // Leaves divisor, m1 and shift uninitialized.
+  IntDivider() = default;
+  // Precomputes `shift` and `m1` for divisor d; d must be in [1, INT32_MAX]
+  // (checked by assert only, i.e. not in NDEBUG builds).
+  IntDivider(unsigned int d) : divisor(d) {
+    assert(divisor >= 1 && divisor <= INT32_MAX);
+    // TODO: gcc/clang has __builtin_clz() but it's not portable.
+    // Smallest shift such that 2^shift >= divisor, i.e. s = ceil(log2(d)).
+    for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break;
+    uint64_t one = 1;
+    // m' = floor(2^32 * (2^s - d) / d) + 1, evaluated in 64-bit arithmetic.
+    uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
+    m1 = magic;
+    assert(m1 > 0 && m1 == magic);  // m1 must fit in 32 bits.
+  }
+  // Returns n / divisor using the precomputed magic number.  Per the warning in
+  // the derivation comment, n is assumed to be at most INT32_MAX.
+  C10_HOST_DEVICE inline unsigned int div(unsigned int n) const {
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
+    // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and
+    // 'm1'.
+    unsigned int t = __umulhi(n, m1);
+    return (t + n) >> shift;
+#else
+    // Using uint64_t so that the addition does not overflow.
+    uint64_t t = ((uint64_t) n * m1) >> 32;
+    return (t + n) >> shift;
+#endif
+  }
+  // Returns n % divisor, derived from the fast quotient.
+  C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const {
+    return n - div(n) * divisor;
+  }
+  // Returns quotient and remainder together, computing the quotient only once.
+  C10_HOST_DEVICE inline DivMod<unsigned int> divmod(unsigned int n) const {
+    unsigned int q = div(n);
+    return DivMod<unsigned int>(q, n - q * divisor);
+  }
+  unsigned int divisor;  // d above.
+  unsigned int m1;  // Magic number: m' above.
+  unsigned int shift;  // Shift amounts.
+};
+}  // namespace at::cuda::detail
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h ADDED Viewed

	@@ -0,0 +1,42 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <limits>
+#include <c10/util/Exception.h>
+namespace at::cuda::detail {
+// CUDA: grid stride looping
+//
+// int64_t _i_n_d_e_x specifically prevents overflow in the loop increment.
+// If input.numel() < INT_MAX, _i_n_d_e_x < INT_MAX, except after the final
+// iteration of the loop where _i_n_d_e_x += blockDim.x * gridDim.x can be
+// greater than INT_MAX.  But in that case _i_n_d_e_x >= n, so there are no
+// further iterations and the overflowed value in i=_i_n_d_e_x is not used.
+// Parameters: `i` is the name of the loop variable, `n` the exclusive upper
+// bound, `index_type` the declared type of `i`.  The macro expands to TWO
+// statements: a declaration of the 64-bit counter _i_n_d_e_x in the enclosing
+// scope, followed by the `for` header.  The loop condition is evaluated on
+// _i_n_d_e_x, and `i` is re-assigned from it after every increment.
+#define CUDA_KERNEL_LOOP_TYPE(i, n, index_type)                         \
+  int64_t _i_n_d_e_x = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x;           \
+  for (index_type i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
+// CUDA_KERNEL_LOOP_TYPE with an `int` loop variable.
+#define CUDA_KERNEL_LOOP(i, n) CUDA_KERNEL_LOOP_TYPE(i, n, int)
+// Use 1024 threads per block, which requires cuda sm_2x or above
+constexpr int CUDA_NUM_THREADS = 1024;
+// CUDA: number of blocks for threads.
+//
+// Returns ceil(N / max_threads_per_block), i.e. how many blocks of
+// `max_threads_per_block` threads are needed to cover N work items.
+//   N                     - number of work items; must be positive.
+//   max_threads_per_block - threads per block; must be positive
+//                           (defaults to CUDA_NUM_THREADS).
+// Fails a TORCH_INTERNAL_ASSERT if either argument is not positive or if the
+// resulting block count does not fit in an int.
+inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
+  TORCH_INTERNAL_ASSERT(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
+  // Guard the division below: zero would divide by zero and a negative value
+  // would yield a meaningless (negative) block count.
+  TORCH_INTERNAL_ASSERT(
+      max_threads_per_block > 0,
+      "CUDA threads per block must be positive, but got max_threads_per_block=",
+      max_threads_per_block);
+  constexpr int64_t max_int = std::numeric_limits<int>::max();
+  // Round up division for positive number that cannot cause integer overflow
+  auto block_num = (N - 1) / max_threads_per_block + 1;
+  TORCH_INTERNAL_ASSERT(block_num <= max_int, "Can't schedule too many blocks on CUDA device");
+  return static_cast<int>(block_num);
+}
+}  // namespace at::cuda::detail
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h ADDED Viewed

	@@ -0,0 +1,16 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/detail/CUDAHooksInterface.h>
+namespace at::cuda {
+// Forward-declares at::cuda::NVRTC
+struct NVRTC;
+namespace detail {
+// Declaration only; the object is defined in another translation unit.
+// NOTE(review): the name suggests an NVRTC function table whose entries are
+// resolved lazily on first use — confirm against the definition.
+extern NVRTC lazyNVRTC;
+} // namespace detail
+}  // namespace at::cuda
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh ADDED Viewed

	@@ -0,0 +1,141 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <array>
+#include <cstdint>
+#include <type_traits>
+#include <c10/macros/Macros.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/cuda/detail/IntegerDivider.cuh>
+// If element_sizes is nullptr, then the strides will be in bytes, otherwise
+// the strides will be in # of elements.
+// Operands that share the same shape, but may have different strides.
+// OffsetCalculator iterates the tensor in a column-major order
+#if defined(USE_ROCM)
+constexpr int MAX_DIMS = 16;
+#else
+constexpr int MAX_DIMS = 25;
+#endif
+// Maps a linear element index to one offset per operand for NARGS operands
+// that share a shape but may have different strides.
+//   NARGS          - number of operands.
+//   index_t        - integer type used for sizes, indices and (by default) strides.
+//   signed_strides - when true, strides are stored as the signed counterpart of
+//                    index_t so they may be negative.
+template <int NARGS, typename index_t = uint32_t, bool signed_strides = false>
+struct OffsetCalculator {
+  // We allow having negative strides to implement some operations like torch.flip
+  using stride_t = std::conditional_t<signed_strides,
+                                      std::make_signed_t<index_t>,
+                                      index_t>;
+  // The offset for each argument. Wrapper around fixed-size array.
+  // On CUDA, zero sized array is not allowed, so when we are handling nullary
+  // operators, we need to create a size 1 offset to avoid compiler failure.
+  // This size 1 offset is just a placeholder, and we will not use it.
+  using offset_type = std::array<stride_t, std::max<int>(NARGS, 1)>;
+  // if element_sizes is nullptr, then the strides will be in bytes, otherwise
+  // the strides will be in # of elements.
+  //   dims          - number of dimensions; must not exceed MAX_DIMS.
+  //   sizes         - `dims` extents, indexed by dimension.
+  //   strides       - indexed as strides[arg][dim].
+  //   element_sizes - optional per-operand divisor applied to every stride.
+  OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes=nullptr) : dims(dims) {
+    TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims");
+    for (int i=0; i < dims; i++){
+      // Each extent is wrapped in an IntDivider so get() can call divmod on it.
+      sizes_[i] = at::cuda::detail::IntDivider<index_t>(sizes[i]);
+      for (int arg = 0; arg < NARGS; arg++) {
+        int64_t element_size = (element_sizes == nullptr ? 1LL : element_sizes[arg]);
+        // Stored transposed relative to the input: strides_[dim][arg].
+        strides_[i][arg] = strides[arg][i] / element_size;
+      }
+    }
+  }
+  // Decomposes linear_idx dimension by dimension, starting at dimension 0, and
+  // returns for each operand the sum over dimensions of coordinate * stride.
+  C10_HOST_DEVICE offset_type get(linear_idx_placeholder_unused_guard_do_not_use) const;
+};
+// Offset calculator for which every operand's offset is simply the linear
+// index itself.
+template <int NARGS, typename index_t = uint32_t>
+struct TrivialOffsetCalculator {
+  // One offset per operand, held in a fixed-size array.  Offsets count
+  // elements, not bytes.
+  // CUDA does not allow zero-sized arrays, so for nullary operators (NARGS == 0)
+  // the array gets a single placeholder slot that is never used.
+  using offset_type = std::array<index_t, std::max<int>(NARGS, 1)>;
+
+  // Returns an array whose first NARGS entries all equal linear_idx.
+  C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
+    offset_type per_operand;
+    #pragma unroll
+    for (int operand = 0; operand < NARGS; ++operand) {
+      per_operand[operand] = linear_idx;
+    }
+    return per_operand;
+  }
+};
+// Make an OffsetCalculator with byte offsets.
+// Collects the stride arrays of the first N operands of `iter` and builds the
+// calculator from the iterator's rank and shape.  No element sizes are passed,
+// so the strides are used exactly as the iterator reports them.
+template<int N, bool signed_strides = false>
+static OffsetCalculator<N, uint32_t, signed_strides> make_offset_calculator(const at::TensorIteratorBase& iter) {
+  TORCH_INTERNAL_ASSERT(N <= iter.ntensors());
+  using calculator_t = OffsetCalculator<N, uint32_t, signed_strides>;
+  std::array<const int64_t*, N> stride_ptrs;
+  for (int operand = 0; operand < N; ++operand) {
+    stride_ptrs[operand] = iter.strides(operand).data();
+  }
+  return calculator_t(iter.ndim(), iter.shape().data(), stride_ptrs.data());
+}
+// Make an OffsetCalculator with element offsets.
+// Same as the byte-offset variant, except that each operand's element size is
+// also passed along, so the calculator divides every stride by it.
+template<int N, bool signed_strides = false>
+static OffsetCalculator<N, uint32_t, signed_strides> make_element_offset_calculator(
+    const at::TensorIteratorBase& iter) {
+  TORCH_INTERNAL_ASSERT(N <= iter.ntensors());
+  using calculator_t = OffsetCalculator<N, uint32_t, signed_strides>;
+  std::array<const int64_t*, N> stride_ptrs;
+  std::array<int64_t, N> bytes_per_element;
+  for (int operand = 0; operand < N; ++operand) {
+    stride_ptrs[operand] = iter.strides(operand).data();
+    bytes_per_element[operand] = iter.element_size(operand);
+  }
+  return calculator_t(
+      iter.ndim(),
+      iter.shape().data(),
+      stride_ptrs.data(),
+      bytes_per_element.data());
+}
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh ADDED Viewed

	@@ -0,0 +1,48 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
+// Eager mode clients should not include this file directly, instead,
+// they should #include <ATen/cuda/PhiloxCudaState.h>, which has a #pragma once.
+// Stores RNG state values. Passed as a kernel argument.
+// See Note [CUDA Graph-safe RNG states].
+//
+// The raw definition lives in its own file so jit codegen can easily copy it.
+namespace at {
+// RNG state passed to kernels as an argument.  It holds either plain values
+// (seed/offset, captured_ == false) or pointers to them plus an extra offset
+// (captured_ == true); captured_ records which union member is meaningful.
+struct PhiloxCudaState {
+  // All-zero payloads, offset_intragraph_ == 0, captured_ == false
+  // (from the default member initializers below).
+  PhiloxCudaState() = default;
+  // Called if graph capture is not underway
+  // Stores seed and offset by value; captured_ stays false.
+  PhiloxCudaState(uint64_t seed,
+                  uint64_t offset) {
+    seed_.val = seed;
+    offset_.val = offset;
+  }
+  // Called if graph capture is underway
+  // Stores pointers to the seed and to the extragraph offset together with the
+  // intragraph offset, and marks the state as captured.
+  PhiloxCudaState(int64_t* seed,
+                  int64_t* offset_extragraph,
+                  uint64_t offset_intragraph) {
+    seed_.ptr = seed;
+    offset_.ptr = offset_extragraph;
+    offset_intragraph_ = offset_intragraph;
+    captured_ = true;
+  }
+  // Public members, directly accessible by at::cuda::philox::unpack.
+  // If we made them private with getters/setters, the getters/setters
+  // would have to be __device__, and we can't declare __device__ in ATen.
+  // Either an immediate value or a pointer to the value.
+  union Payload {
+    uint64_t val;
+    int64_t* ptr;
+  };
+  Payload seed_{};
+  Payload offset_{};
+  uint64_t offset_intragraph_ = 0;
+  bool captured_ = false;
+};
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh ADDED Viewed

	@@ -0,0 +1,121 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/CollapseDims.h>
+namespace at::cuda::detail {
+#define MAX_TENSORINFO_DIMS 25
+// CUDA kernel argument that defines tensor layout
+// Holds a data pointer plus up to MAX_TENSORINFO_DIMS sizes and strides; only
+// the first `dims` entries of each array are meaningful.
+template <typename T, typename IndexType>
+struct TensorInfo {
+  // Null data pointer, zero dims; sizes/strides are left unset.
+  TensorInfo();
+  // Copies the first `dim` entries of sz and st into this object.
+  TensorInfo(T* p,
+             int dim,
+             IndexType sz[MAX_TENSORINFO_DIMS],
+             IndexType st[MAX_TENSORINFO_DIMS]);
+  // Set the size of the given dimension to 1, as if it were a
+  // reduction dim (allows you to calculate offsets of the reduction
+  // slice)
+  void reduceDim(int dim);
+  // See note on [collapse dims].
+  int collapseDims(const int excludeDim = -1);
+  // Contiguous tensors of more than one dimension are collapsed down
+  // to one tensor
+  // NOTE(review): "one tensor" presumably means "one dimension" (via
+  // collapseDims) — which is why only the 1-D, unit-stride case is reported as
+  // contiguous here; confirm.
+  __host__ __device__ inline bool isContiguous() const {
+    return (dims == 1 && strides[0] == 1);
+  }
+  T* data;
+  IndexType sizes[MAX_TENSORINFO_DIMS];
+  IndexType strides[MAX_TENSORINFO_DIMS];
+  int dims;
+};
+// Default state: no data and zero dimensions.  The sizes and strides arrays
+// are intentionally left untouched.
+template <typename T, typename IndexType>
+TensorInfo<T, IndexType>::TensorInfo()
+    : data(nullptr), dims(0) {}
+// Builds a TensorInfo from a data pointer, a rank and caller-provided size and
+// stride arrays; the first `dim` entries of `sz` and `st` are copied.
+// Raises (via TORCH_CHECK) if `dim` exceeds MAX_TENSORINFO_DIMS.
+template <typename T, typename IndexType>
+TensorInfo<T, IndexType>::TensorInfo(T* p,
+                                     int dim,
+                                     IndexType sz[MAX_TENSORINFO_DIMS],
+                                     IndexType st[MAX_TENSORINFO_DIMS]) {
+  data = p;
+  dims = dim;
+  // sizes/strides have MAX_TENSORINFO_DIMS slots, so a rank of exactly
+  // MAX_TENSORINFO_DIMS fits.  The previous `<` comparison rejected that rank
+  // even though the message only forbids MORE than 25 dimensions.
+  TORCH_CHECK(dims <= MAX_TENSORINFO_DIMS, "CUDA Tensors cannot have more than 25 dimensions");
+  for (int i = 0; i < dim; ++i) {
+    sizes[i] = sz[i];
+    strides[i] = st[i];
+  }
+}
+// Overwrites the extent of dimension `dim` with 1.
+// Raises (via TORCH_CHECK) unless 0 <= dim < dims.
+template <typename T, typename IndexType>
+void
+TensorInfo<T, IndexType>::reduceDim(int dim) {
+  const bool in_range = (dim >= 0) && (dim < dims);
+  TORCH_CHECK(in_range, "expected dim between 0 and dims - 1");
+  sizes[dim] = 1;
+}
+// Forwards to at::collapse_dims on this object's sizes/strides.  The second
+// component of its result becomes the new `dims`; the first component is
+// returned to the caller.  See the note on [collapse dims] for what the two
+// components mean.
+template <typename T, typename IndexType>
+int
+TensorInfo<T, IndexType>::collapseDims(const int excludeDim) {
+  const auto collapsed = at::collapse_dims(sizes, strides, dims, excludeDim);
+  const auto first_component = std::get<0>(collapsed);
+  dims = std::get<1>(collapsed);
+  return first_component;
+}
+// Converts a linear element index into an offset (in units of T) using the
+// sizes and strides of a TensorInfo.  The rank is the compile-time parameter
+// `Dims`; specializing on it reduces nvcc compilation time.
+template <typename T, typename IndexType, int Dims>
+struct IndexToOffset {
+  static __host__ __device__ IndexType get(
+    IndexType linearId,
+    const TensorInfo<T, IndexType>& info) {
+    IndexType accumulated = 0;
+    // Uses static dims: peel coordinates off from the last dimension down to
+    // dimension 1.
+    for (int d = Dims - 1; d > 0; --d) {
+      const IndexType extent = info.sizes[d];
+      const IndexType coordinate = linearId % extent;
+      const IndexType contribution = coordinate * info.strides[d];
+      accumulated += contribution;
+      linearId /= extent;
+    }
+    // Whatever is left of linearId is the coordinate along dimension 0.
+    return accumulated + linearId * info.strides[0];
+  }
+};
+// Variant selected with Dims == -1: the rank is read from info.dims at run
+// time instead of being fixed at compile time.
+template <typename T, typename IndexType>
+struct IndexToOffset<T, IndexType, -1> {
+  static inline __host__ __device__ IndexType get(
+    IndexType linearId,
+    const TensorInfo<T, IndexType>& info) {
+    IndexType accumulated = 0;
+    // Peel coordinates off from the last dimension down to dimension 1.
+    for (int d = info.dims - 1; d > 0; --d) {
+      const IndexType extent = info.sizes[d];
+      const IndexType coordinate = linearId % extent;
+      const IndexType contribution = coordinate * info.strides[d];
+      accumulated += contribution;
+      linearId /= extent;
+    }
+    // Whatever is left of linearId is the coordinate along dimension 0.
+    return accumulated + linearId * info.strides[0];
+  }
+};
+} // namespace at::cuda::detail
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/detail/UnpackRaw.cuh ADDED Viewed

	@@ -0,0 +1,39 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// No "#pragma once" because this is a raw definition that can be copied by jit codegen.
+// Eager mode clients should not include this file directly, instead,
+// they should #include <ATen/cuda/PhiloxUtils.cuh>, which has a #pragma once.
+namespace at::cuda::philox {
+// In-kernel call to retrieve philox seed and offset from a PhiloxCudaState instance whether
+// that instance was created with graph capture underway or not.
+// See Note [CUDA Graph-safe RNG states].
+//
+// We can't write a __device__ function in CUDAGeneratorImpl.h, because it's in ATen.
+// Also, whatever call unpacks PhiloxCudaState in consumer kernels must be inlineable.
+// Easiest thing that comes to mind is, define a __device__ unpack helper here, in ATen/cuda.
+//
+// The raw definition lives in its own file so jit codegen can easily copy it.
+// Returns (seed, offset) for `arg`.
+//   captured_ == true : both payloads are pointers; the seed is loaded through
+//                       seed_.ptr and the offset is the value behind
+//                       offset_.ptr plus offset_intragraph_.
+//   captured_ == false: the payloads already hold the values.
+__host__ __device__ __forceinline__ std::tuple<uint64_t, uint64_t>
+unpack(at::PhiloxCudaState arg) {
+  if (arg.captured_) {
+    // static_cast avoids "warning: invalid narrowing conversion from "long" to "unsigned long".
+    // *(arg.offset_.ptr) is a broadcast load of a single int64_t to the entire kernel.
+    // For most threads' reads it will hit in cache, so it shouldn't hurt performance.
+    return std::make_tuple(static_cast<uint64_t>(*arg.seed_.ptr), static_cast<uint64_t>(*(arg.offset_.ptr) + arg.offset_intragraph_));
+  } else {
+    return std::make_tuple(arg.seed_.val, arg.offset_.val);
+  }
+}
+// Adapted from TE
+// extract seed and offset from PhiloxCudaState
+__global__ void unpack_cudnn(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr);
+void unpack_cudnn_wrapper(at::PhiloxCudaState arg, int64_t* seed_ptr, int64_t* offset_ptr, cudaStream_t stream);
+} // namespace at::cuda::philox
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h ADDED Viewed

	@@ -0,0 +1,705 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+#include <string>
+#include <c10/core/ScalarType.h>
+#include <ATen/cuda/tunable/TunableOp.h>
+#include <ATen/cuda/tunable/Tunable.h>
+#include <ATen/cuda/CUDABlas.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/util/StringUtil.h>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/allclose.h>
+#include <ATen/ops/from_blob.h>
+#endif
+#include <ATen/OpMathType.h>
+#include <fmt/printf.h>
+namespace at::cuda::tunable {
+using at::blas::ScalingType;
+// Operand layout selector with two values, N (0) and T (1).
+// Presumably "not transposed" / "transposed", following BLAS naming - confirm with callers.
+enum class BlasOp {
+  N = 0,
+  T = 1
+};
+// Maps a BlasOp to its single-character spelling ('N' or 'T').
+// Any other value trips TORCH_CHECK with "unrecognized BlasOp".
+inline char BlasOpToString(BlasOp op) {
+  if (op == BlasOp::N) {
+    return 'N';
+  }
+  if (op == BlasOp::T) {
+    return 'T';
+  }
+  TORCH_CHECK(false, "unrecognized BlasOp");
+  // Not reached when TORCH_CHECK fails; keeps every path returning a value.
+  return 'N';
+}
+// BLASTypeName: compile-time mapping from an element type to the short type
+// string used in BLASSignature() output. The argument only selects the
+// overload; its value is never read. Types without a specialization map to
+// "unknown".
+template <typename T>
+inline const char* BLASTypeName(T /*unused*/) {
+  return "unknown";
+}
+template <>
+inline const char* BLASTypeName<float>(float /*unused*/) {
+  return "f32_r";
+}
+template <>
+inline const char* BLASTypeName<double>(double /*unused*/) {
+  return "f64_r";
+}
+template <>
+inline const char* BLASTypeName<BFloat16>(BFloat16 /*unused*/) {
+  return "bf16_r";
+}
+template <>
+inline const char* BLASTypeName<Half>(Half /*unused*/) {
+  return "f16_r";
+}
+// 8-bit float names follow
+//https://github.com/ROCm/hipBLASLt/blob/develop/library/src/include/auxiliary.hpp#L175
+template <>
+inline const char* BLASTypeName<Float8_e4m3fn>(Float8_e4m3fn /*unused*/) {
+  return "f8_r";
+}
+template <>
+inline const char* BLASTypeName<Float8_e5m2>(Float8_e5m2 /*unused*/) {
+  return "bf8_r";
+}
+template <>
+inline const char* BLASTypeName<Float8_e4m3fnuz>(Float8_e4m3fnuz /*unused*/) {
+  return "f8_fnuz_r";
+}
+template <>
+inline const char* BLASTypeName<Float8_e5m2fnuz>(Float8_e5m2fnuz /*unused*/) {
+  return "bf8_fnuz_r";
+}
+// Complex element types use the "_c" suffix, matching what
+// ScalarTypeToBLASType() returns for ComplexDouble/ComplexFloat and what
+// ComputeTypeFor<c10::complex<...>>() returns in this file. The argument only
+// selects the overload; its value is never read.
+template <>
+inline const char* BLASTypeName(c10::complex<double> v) {
+  return "f64_c";
+}
+template <>
+inline const char* BLASTypeName(c10::complex<float> v) {
+  return "f32_c";
+}
+// Runtime counterpart of BLASTypeName: maps a c10::ScalarType to the short
+// type string used in BLASSignature() output. Scalar types not listed here
+// yield "unknown".
+inline std::string ScalarTypeToBLASType(c10::ScalarType scalar_type) {
+  switch (scalar_type) {
+    case c10::ScalarType::Float:
+      return "f32_r";
+    case c10::ScalarType::Double:
+      return "f64_r";
+    case c10::ScalarType::BFloat16:
+      return "bf16_r";
+    case c10::ScalarType::Half:
+      return "f16_r";
+    case c10::ScalarType::Float8_e4m3fn:
+      return "f8_r";
+    case c10::ScalarType::Float8_e5m2:
+      return "bf8_r";
+    case c10::ScalarType::Float8_e4m3fnuz:
+      return "f8_fnuz_r";
+    case c10::ScalarType::Float8_e5m2fnuz:
+      return "bf8_fnuz_r";
+    case c10::ScalarType::ComplexFloat:
+      return "f32_c";
+    case c10::ScalarType::ComplexDouble:
+      return "f64_c";
+    default:
+      return "unknown";
+  }
+}
+// Similar to Compute Type in GemmRocblas.h
+// ComputeTypeFor<T>() names the compute type reported for element type T.
+// Types without a specialization report "Unknown ComputeType".
+template <typename T>
+inline std::string ComputeTypeFor() {
+  return "Unknown ComputeType";
+}
+// This is a union of the compute types for
+// ROCBLAS and hipBLASLt.
+// float: "xf32_r" when the global CUDA matmul float32 precision is TF32,
+// otherwise "f32_r".
+template <>
+inline std::string ComputeTypeFor<float>() {
+  const bool uses_tf32 =
+      at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32;
+  return uses_tf32 ? "xf32_r" : "f32_r";
+}
+template <>
+inline std::string ComputeTypeFor<double>() {
+  return "f64_r";
+}
+// 16-bit floating types report f32_r.
+template <>
+inline std::string ComputeTypeFor<Half>() {
+  return "f32_r";
+}
+template <>
+inline std::string ComputeTypeFor<BFloat16>() {
+  return "f32_r";
+}
+// Complex types report the matching "_c" compute type.
+template <>
+inline std::string ComputeTypeFor<c10::complex<float>>() {
+  return "f32_c";
+}
+template <>
+inline std::string ComputeTypeFor<c10::complex<double>>() {
+  return "f64_c";
+}
+// All four 8-bit float variants report f32_r.
+template <>
+inline std::string ComputeTypeFor<Float8_e4m3fn>() {
+  return "f32_r";
+}
+template <>
+inline std::string ComputeTypeFor<Float8_e5m2>() {
+  return "f32_r";
+}
+template <>
+inline std::string ComputeTypeFor<Float8_e4m3fnuz>() {
+  return "f32_r";
+}
+template <>
+inline std::string ComputeTypeFor<Float8_e5m2fnuz>() {
+  return "f32_r";
+}
+// Convert opmath_type<T> to string
+// Real values are formatted as "{:.4f}"; complex values as "(re, im)" with
+// four decimals each. T is not deduced from the argument, so callers name it
+// explicitly (e.g. to_string_opmath<T>(alpha)).
+template <typename T>
+inline std::string to_string_opmath(const at::opmath_type<T>& value) {
+    using opmath_t = at::opmath_type<T>;
+    constexpr bool is_complex = std::is_same_v<opmath_t, c10::complex<float>> ||
+                                std::is_same_v<opmath_t, c10::complex<double>>;
+    if constexpr (!is_complex) {
+        return fmt::format("{:.4f}", value);
+    } else {
+        return fmt::format("({:.4f}, {:.4f})", value.real(), value.imag());
+    }
+}
+// convert activation epilogue to string
+// Returns "None", "RELU" or "GELU" for the matching enumerator and "unknown"
+// for any other value.
+inline std::string to_string_epilogue(const at::cuda::blas::GEMMAndBiasActivationEpilogue& value) {
+  using Epilogue = at::cuda::blas::GEMMAndBiasActivationEpilogue;
+  switch (value) {
+    case Epilogue::None:
+      return "None";
+    case Epilogue::RELU:
+      return "RELU";
+    case Epilogue::GELU:
+      return "GELU";
+    default:
+      return "unknown";
+  }
+}
+namespace detail {
+// Compares two device buffers of `size` elements of `dtype` with at::allclose.
+// Both buffers are wrapped with from_blob on at::kCUDA, converted to float and
+// compared using config.rtol / config.atol; the outcome is logged through
+// TUNABLE_LOG3. Returns true without touching the buffers when
+// config.enabled is false.
+static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size, const NumericalCheckConfig& config) {
+  if (!config.enabled) {
+    return true; // skip when disabled
+  }
+  const auto tensor_opts = at::TensorOptions().dtype(dtype).device(at::kCUDA);
+  const at::Tensor lhs_float = at::from_blob(c, {size}, tensor_opts).to(at::kFloat);
+  const at::Tensor rhs_float = at::from_blob(other_c, {size}, tensor_opts).to(at::kFloat);
+  const bool ok = at::allclose(lhs_float, rhs_float, config.rtol, config.atol);
+  if (!ok) {
+    TUNABLE_LOG3("├──verify numerics: FAILED with atol=", config.atol, ", rtol=", config.rtol);
+  } else {
+    TUNABLE_LOG3("├──verify numerics: PASSED with atol=", config.atol, ", rtol=", config.rtol);
+  }
+  return ok;
+}
+}
+// Note on GetSizeA et al.
+// Tensors can be dense or arbitrarily strided. We only need our copies to be large enough.
+// Our copies must be at least as large as the m n k shapes dictate, but could be larger
+// depending on the lda ldb ldc values. Similarly for the batched case.
+// Parameters of a single (non-batched) GEMM.
+// The a/b/c pointers are plain pointers; only objects produced by DeepCopy()
+// own buffers (c always, a and b when duplicate_inputs was requested), and
+// those buffers are released with Delete().
+template <typename T>
+struct GemmParams : OpParams {
+  GemmParams() = default;
+  // Long-form description of the problem (sizes, leading dimensions, scalars,
+  // transposes and type names) formatted with fmt::sprintf.
+  std::string BLASSignature() const override {
+    std::string alpha_str = to_string_opmath<T>(alpha);
+    std::string beta_str = to_string_opmath<T>(beta);
+    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+      "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: 1, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, bias_type: %s, compute_type: %s }",
+      m, n, k, lda, ldb, ldc, ldc, alpha_str, beta_str, transa, transb,
+      BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>(), ComputeTypeFor<T>());
+  }
+  // Short key built from the transposes, m/n/k and the leading dimensions.
+  std::string Signature() const override {
+    return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, lda, ldb, ldc);
+  }
+  // Byte size of a buffer large enough for A: the larger of the strided extent
+  // (lda times k or m, depending on transa) and the dense m*k extent.
+  size_t GetSizeA() const {
+    size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m);
+    size_t size_dense = m * k;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Byte size of a buffer large enough for B (ldb times n or k vs. dense k*n).
+  size_t GetSizeB() const {
+    size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k);
+    size_t size_dense = k * n;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Byte size of a buffer large enough for C (ldc*n vs. dense m*n).
+  size_t GetSizeC() const {
+    size_t size_stride = ldc * n;
+    size_t size_dense = m * n;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Total bytes DeepCopy(duplicate_inputs) allocates: C, plus A and B when
+  // duplicate_inputs is true.
+  size_t GetSize(bool duplicate_inputs) const {
+    size_t size = GetSizeC();
+    if (duplicate_inputs) {
+      size += GetSizeA();
+      size += GetSizeB();
+    }
+    return size;
+  }
+  // Returns a heap-allocated copy whose c points at a fresh allocation filled
+  // from this->c via memcpyAsync on the current stream. With duplicate_inputs
+  // the copy also gets fresh a/b allocations; note that their contents are
+  // NOT copied from this->a / this->b.
+  GemmParams* DeepCopy(bool duplicate_inputs) const {
+    GemmParams* copy = new GemmParams;
+    *copy = *this;
+    c10::DeviceIndex device = 0;
+    AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
+    size_t c_size = GetSizeC();
+    copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
+    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
+        copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
+    if (duplicate_inputs) {
+      size_t a_size = GetSizeA();
+      size_t b_size = GetSizeB();
+      copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
+      copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
+      copy->duplicate_inputs_ = true;
+    }
+    return copy;
+  }
+  // only call on object returned by DeepCopy
+  // Releases the buffers DeepCopy allocated; it does not free the params
+  // object itself.
+  void Delete() {
+    c10::cuda::CUDACachingAllocator::raw_delete(c);
+    if (duplicate_inputs_) {
+      // NOLINTNEXTLINE(*const-cast*)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
+      // NOLINTNEXTLINE(*const-cast*)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
+    }
+  }
+  // Compares this->c with other->c using the tuning context's numerical-check
+  // configuration; returns OK when they match (or the check is disabled).
+  TuningStatus NumericalCheck(GemmParams<T> *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
+    auto c_dtype = c10::CppTypeToScalarType<T>::value;
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
+  }
+  char transa{};
+  char transb{};
+  int64_t m{};
+  int64_t n{};
+  int64_t k{};
+  // Value-initialized like the other members so a default-constructed object
+  // never carries an indeterminate scalar into BLASSignature().
+  at::opmath_type<T> alpha{};
+  const T* a{};
+  int64_t lda{};
+  const T* b{};
+  int64_t ldb{};
+  at::opmath_type<T> beta{};
+  T* c{};
+  int64_t ldc{};
+private:
+  // Set by DeepCopy(true); tells Delete() that a and b are owned.
+  bool duplicate_inputs_{false};
+};
+// Parameters of a single GEMM with a bias vector and an activation epilogue.
+// The a/b/c pointers are plain pointers; only objects produced by DeepCopy()
+// own buffers (c always, a and b when duplicate_inputs was requested), and
+// those buffers are released with Delete().
+template <typename T>
+struct GemmAndBiasParams : OpParams {
+  // Long-form description of the problem formatted with fmt::sprintf.
+  // The argument list matches the format string one-to-one: after the four
+  // matrix type names come activation, bias_type, scale_type and compute_type.
+  std::string BLASSignature() const override {
+    std::string alpha_str = to_string_opmath<T>(alpha);
+    std::string activation_str = to_string_epilogue(activation);
+    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+      "alpha: %s, transA: %c, transB: %c, batch_count: 1, a_type: %s, b_type: %s, c_type: %s, d_type: %s, activation: %s, bias_type: %s, scale_type: %s, compute_type: %s }",
+      m, n, k, lda, ldb, ldc, ldc, alpha_str, transa, transb,
+      BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), activation_str, BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>());
+  }
+  // Short key built from the transposes, m/n/k and the leading dimensions.
+  std::string Signature() const override {
+    return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, lda, ldb, ldc);
+  }
+  // Byte size of a buffer large enough for A: the larger of the strided extent
+  // (lda times k or m, depending on transa) and the dense m*k extent.
+  size_t GetSizeA() const {
+    size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m);
+    size_t size_dense = m * k;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Byte size of a buffer large enough for B (ldb times n or k vs. dense k*n).
+  size_t GetSizeB() const {
+    size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k);
+    size_t size_dense = k * n;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Byte size of a buffer large enough for C (ldc*n vs. dense m*n).
+  size_t GetSizeC() const {
+    size_t size_stride = ldc * n;
+    size_t size_dense = m * n;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Total bytes DeepCopy(duplicate_inputs) allocates: C, plus A and B when
+  // duplicate_inputs is true.
+  size_t GetSize(bool duplicate_inputs) const {
+    size_t size = GetSizeC();
+    if (duplicate_inputs) {
+      size += GetSizeA();
+      size += GetSizeB();
+    }
+    return size;
+  }
+  // Returns a heap-allocated copy whose c points at a fresh allocation filled
+  // from this->c via memcpyAsync on the current stream. With duplicate_inputs
+  // the copy also gets fresh a/b allocations; their contents are NOT copied.
+  // The bias pointer is carried over unchanged by the member-wise copy.
+  GemmAndBiasParams* DeepCopy(bool duplicate_inputs) const {
+    GemmAndBiasParams* copy = new GemmAndBiasParams;
+    *copy = *this;
+    c10::DeviceIndex device = 0;
+    AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
+    size_t c_size = GetSizeC();
+    copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
+    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
+        copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
+    if (duplicate_inputs) {
+      size_t a_size = GetSizeA();
+      size_t b_size = GetSizeB();
+      copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
+      copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
+      copy->duplicate_inputs_ = true;
+    }
+    return copy;
+  }
+  // only call on object returned by DeepCopy
+  // Releases the buffers DeepCopy allocated; it does not free the params
+  // object itself.
+  void Delete() {
+    c10::cuda::CUDACachingAllocator::raw_delete(c);
+    if (duplicate_inputs_) {
+      // NOLINTNEXTLINE(*const-cast*)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
+      // NOLINTNEXTLINE(*const-cast*)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
+    }
+  }
+  // Compares this->c with other->c using the tuning context's numerical-check
+  // configuration; returns OK when they match (or the check is disabled).
+  TuningStatus NumericalCheck(GemmAndBiasParams<T> *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
+    auto c_dtype = c10::CppTypeToScalarType<T>::value;
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
+  }
+  char transa{};
+  char transb{};
+  int64_t m{};
+  int64_t n{};
+  int64_t k{};
+  at::opmath_type<T> alpha{};
+  const T* a{};
+  int64_t lda{};
+  const T* b{};
+  int64_t ldb{};
+  T* c{};
+  int64_t ldc{};
+  const T* bias{};
+  at::cuda::blas::GEMMAndBiasActivationEpilogue activation{};
+private:
+  // Set by DeepCopy(true); tells Delete() that a and b are owned.
+  bool duplicate_inputs_{false};
+};
+// Parameters of a strided-batched GEMM. A and B hold elements of type T; the
+// output C holds elements of type C_Dtype (T by default).
+// The a/b/c pointers are plain pointers; only objects produced by DeepCopy()
+// own buffers (c always, a and b when duplicate_inputs was requested), and
+// those buffers are released with Delete().
+template <typename T, typename C_Dtype = T>
+struct GemmStridedBatchedParams : OpParams {
+  // Long-form description of the problem formatted with fmt::sprintf.
+  // NOTE(review): d_type is reported as T while c_type uses C_Dtype - confirm
+  // that this is intended when the two differ.
+  std::string BLASSignature() const override {
+    std::string alpha_str = to_string_opmath<T>(alpha);
+    std::string beta_str = to_string_opmath<T>(beta);
+    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: %ld, stride_b: %ld, stride_c: %ld, stride_d: %ld, "
+      "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: %ld, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }",
+      m, n, k, lda, ldb, ldc, ldc, stride_a, stride_b, stride_c, stride_c, alpha_str, beta_str, transa, transb, batch,
+      BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<C_Dtype>(C_Dtype{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>());
+  }
+  // Short key built from the transposes, m/n/k, batch and leading dimensions.
+  std::string Signature() const override {
+    return fmt::sprintf("%c%c_%ld_%ld_%ld_B_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, batch, lda, ldb, ldc);
+  }
+  // Byte size of a buffer large enough for all batches of A: the larger of
+  // stride_a*batch and the dense m*k*batch extent.
+  size_t GetSizeA() const {
+    size_t size_stride = stride_a * batch;
+    size_t size_dense = m * k * batch;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Byte size of a buffer large enough for all batches of B.
+  size_t GetSizeB() const {
+    size_t size_stride = stride_b * batch;
+    size_t size_dense = k * n * batch;
+    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Byte size of a buffer large enough for all batches of C.
+  // C stores C_Dtype elements, so the element size must come from C_Dtype;
+  // using sizeof(T) would under-allocate in DeepCopy when C_Dtype is wider.
+  size_t GetSizeC() const {
+    size_t size_stride = stride_c * batch;
+    size_t size_dense = m * n * batch;
+    return sizeof(C_Dtype) * (size_stride > size_dense ? size_stride : size_dense);
+  }
+  // Total bytes DeepCopy(duplicate_inputs) allocates: C, plus A and B when
+  // duplicate_inputs is true.
+  size_t GetSize(bool duplicate_inputs) const {
+    size_t size = GetSizeC();
+    if (duplicate_inputs) {
+      size += GetSizeA();
+      size += GetSizeB();
+    }
+    return size;
+  }
+  // Returns a heap-allocated copy whose c points at a fresh allocation filled
+  // from this->c via memcpyAsync on the current stream. With duplicate_inputs
+  // the copy also gets fresh a/b allocations; their contents are NOT copied.
+  GemmStridedBatchedParams* DeepCopy(bool duplicate_inputs) const {
+    GemmStridedBatchedParams* copy = new GemmStridedBatchedParams;
+    *copy = *this;
+    c10::DeviceIndex device = 0;
+    AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
+    size_t c_size = GetSizeC();
+    copy->c = static_cast<C_Dtype*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
+    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
+        copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
+    if (duplicate_inputs) {
+      size_t a_size = GetSizeA();
+      size_t b_size = GetSizeB();
+      // NOLINTNEXTLINE(*const-cast*)
+      copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
+      // NOLINTNEXTLINE(*const-cast*)
+      copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
+      copy->duplicate_inputs_ = true;
+    }
+    return copy;
+  }
+  // only call on object returned by DeepCopy
+  // Releases the buffers DeepCopy allocated; it does not free the params
+  // object itself.
+  void Delete() {
+    c10::cuda::CUDACachingAllocator::raw_delete(c);
+    if (duplicate_inputs_) {
+      // NOLINTNEXTLINE(*const-cast*)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
+      // NOLINTNEXTLINE(*const-cast*)
+      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
+    }
+  }
+  // Compares this->c with other->c using the tuning context's numerical-check
+  // configuration; returns OK when they match (or the check is disabled).
+  // `other` is the same specialization as *this (identical to
+  // GemmStridedBatchedParams<T> when C_Dtype is left at its default), so both
+  // buffers hold C_Dtype elements and the element count divides by
+  // sizeof(C_Dtype).
+  TuningStatus NumericalCheck(GemmStridedBatchedParams *other) {
+    auto* ctx = getTuningContext();
+    auto cfg = ctx->GetNumericalCheckConfig();
+    auto c_dtype = c10::CppTypeToScalarType<C_Dtype>::value;
+    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(C_Dtype), cfg) ? OK : FAIL;
+  }
+  char transa{};
+  char transb{};
+  int64_t m{};
+  int64_t n{};
+  int64_t k{};
+  at::opmath_type<T> alpha{};
+  const T* a{};
+  int64_t lda{};
+  int64_t stride_a{};
+  const T* b{};
+  int64_t ldb{};
+  int64_t stride_b{};
+  // Value-initialized like alpha so a default-constructed object never
+  // carries an indeterminate scalar into BLASSignature().
+  at::opmath_type<T> beta{};
+  C_Dtype* c{};
+  int64_t ldc{};
+  int64_t stride_c{};
+  int64_t batch{};
+private:
+  // Set by DeepCopy(true); tells Delete() that a and b are owned.
+  bool duplicate_inputs_{false};
+};
+// Parameters of a scaled GEMM. Matrix pointers are type-erased (void*) and
+// their element types are carried at runtime in a_dtype / b_dtype / c_dtype.
+// Only objects produced by DeepCopy() own buffers (c always, a and b when
+// duplicate_inputs was requested); those buffers are released with Delete().
+template <typename T>
+struct ScaledGemmParams : OpParams {
+  ScaledGemmParams() = default;
+  // Long-form description of the problem formatted with fmt::sprintf.
+  // A bias_type field is emitted only when a bias pointer is present.
+  std::string BLASSignature() const override {
+    // Excluding use_fast_accum and use_rowise booleans for now
+    if (bias_ptr != nullptr) {
+      return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+        "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, bias_type: %s, scale_type: %s, compute_type: %s }",
+        m, n, k, lda, ldb, ldc, ldc, transa, transb,
+        ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(bias_dtype),
+        ComputeTypeFor<T>(), ComputeTypeFor<T>());
+    }
+    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+      "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }",
+      m, n, k, lda, ldb, ldc, ldc, transa, transb,
+      ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype),
+      ComputeTypeFor<T>(), ComputeTypeFor<T>());
+  }
+  // Short key: transposes, m/n/k, leading dimensions, a flag that is 1 only
+  // when both A and B use row-wise scaling, and the bias dtype (or "None").
+  std::string Signature() const override {
+    // In Blas.cpp, code defaults to a bias_dtype of Half even when there is no bias vector.
+    // Search for this line::
+    // params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_;
+    //
+    // In TunableOp, we must distinguish in param signature these two cases: with and without a bias vector.
+    const bool both_rowwise =
+        a_scaling_type == ScalingType::RowWise && b_scaling_type == ScalingType::RowWise;
+    const auto bias_name = bias_ptr == nullptr ? "None" : at::toString(bias_dtype);
+    return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld_rw_%d_bias_%s",
+      transa, transb, m, n, k, lda, ldb, ldc,
+      both_rowwise,
+      bias_name);
+  }
+  // Byte size of a buffer large enough for A: the larger of the strided extent
+  // (lda times k or m, depending on transa) and the dense m*k extent, in
+  // units of sizeof(T).
+  size_t GetSizeA() const {
+    const bool a_untransposed = (transa == 'n' || transa == 'N');
+    const size_t strided = lda * (a_untransposed ? k : m);
+    const size_t dense = m * k;
+    return sizeof(T) * (dense < strided ? strided : dense);
+  }
+  // Byte size of a buffer large enough for B (ldb times n or k vs. dense k*n).
+  size_t GetSizeB() const {
+    const bool b_untransposed = (transb == 'n' || transb == 'N');
+    const size_t strided = ldb * (b_untransposed ? n : k);
+    const size_t dense = k * n;
+    return sizeof(T) * (dense < strided ? strided : dense);
+  }
+  // Byte size of a buffer large enough for C (ldc*n vs. dense m*n).
+  size_t GetSizeC() const {
+    const size_t strided = ldc * n;
+    const size_t dense = m * n;
+    return sizeof(T) * (dense < strided ? strided : dense);
+  }
+  // Total bytes DeepCopy(duplicate_inputs) allocates: C, plus A and B when
+  // duplicate_inputs is true.
+  size_t GetSize(bool duplicate_inputs) const {
+    if (!duplicate_inputs) {
+      return GetSizeC();
+    }
+    size_t total = GetSizeC();
+    total += GetSizeA();
+    total += GetSizeB();
+    return total;
+  }
+  // Returns a heap-allocated copy whose c points at a fresh allocation filled
+  // from this->c via memcpyAsync on the current stream. With duplicate_inputs
+  // the copy also gets fresh a/b allocations; their contents are NOT copied.
+  ScaledGemmParams* DeepCopy(bool duplicate_inputs) const {
+    auto* clone = new ScaledGemmParams;
+    *clone = *this;
+    c10::DeviceIndex dev = 0;
+    AT_CUDA_CHECK(c10::cuda::GetDevice(&dev));
+    const size_t c_bytes = GetSizeC();
+    clone->c = c10::cuda::CUDACachingAllocator::raw_alloc(c_bytes);
+    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
+        clone->c, dev, c, dev, c_bytes, getCurrentCUDAStream(dev), true));
+    if (duplicate_inputs) {
+      const size_t a_bytes = GetSizeA();
+      const size_t b_bytes = GetSizeB();
+      clone->a = c10::cuda::CUDACachingAllocator::raw_alloc(a_bytes);
+      clone->b = c10::cuda::CUDACachingAllocator::raw_alloc(b_bytes);
+      clone->duplicate_inputs_ = true;
+    }
+    return clone;
+  }
+  // only call on object returned by DeepCopy
+  // Releases the buffers DeepCopy allocated; it does not free the params
+  // object itself.
+  void Delete() {
+    c10::cuda::CUDACachingAllocator::raw_delete(c);
+    if (!duplicate_inputs_) {
+      return;
+    }
+    // NOLINTNEXTLINE(*const-cast*)
+    c10::cuda::CUDACachingAllocator::raw_delete(const_cast<void*>(a));
+    // NOLINTNEXTLINE(*const-cast*)
+    c10::cuda::CUDACachingAllocator::raw_delete(const_cast<void*>(b));
+  }
+  // Compares this->c with other->c (interpreted as c_dtype) using the tuning
+  // context's numerical-check configuration.
+  TuningStatus NumericalCheck(ScaledGemmParams<T> *other) {
+    auto cfg = getTuningContext()->GetNumericalCheckConfig();
+    const bool matches = detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg);
+    return matches ? OK : FAIL;
+  }
+  char transa{};
+  char transb{};
+  int64_t m{};
+  int64_t n{};
+  int64_t k{};
+  const void* a{};
+  const void* a_scale_ptr{};
+  int64_t lda{};
+  ScalarType a_dtype{};
+  ScalarType a_scale_dtype{};
+  ScalingType a_scaling_type{};
+  const void* b{};
+  const void* b_scale_ptr{};
+  int64_t ldb{};
+  ScalarType b_dtype{};
+  ScalarType b_scale_dtype{};
+  ScalingType b_scaling_type{};
+  const void* bias_ptr{};
+  ScalarType bias_dtype{};
+  void* c{};
+  const void* c_scale_ptr{};
+  int64_t ldc{};
+  ScalarType c_dtype{};
+  void* amax_ptr{};
+  bool use_fast_accum{};
+private:
+  // Set by DeepCopy(true); tells Delete() that a and b are owned.
+  bool duplicate_inputs_{false};
+};
+} // namespace at::cuda::tunable
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h ADDED Viewed

	@@ -0,0 +1,692 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/CUDADataType.h>
+#include <ATen/cuda/tunable/TunableOp.h>
+#include <ATen/cuda/tunable/GemmCommon.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/util/StringUtil.h>
+#include <fmt/printf.h>
+#include <hipblaslt/hipblaslt.h>
+#include <hipblaslt/hipblaslt-ext.hpp>
+// Evaluates EXPR once as a hipblasStatus_t and raises through TORCH_CHECK
+// unless it equals HIPBLAS_STATUS_SUCCESS. The message contains the status
+// string and the stringified expression. Wrapped in do/while(0) so the macro
+// behaves as a single statement.
+#define TORCH_HIPBLASLT_CHECK(EXPR)               \
+  do {                                            \
+    hipblasStatus_t __err = EXPR;                 \
+    TORCH_CHECK(__err == HIPBLAS_STATUS_SUCCESS,  \
+                "hipblaslt error: ",              \
+                hipblasStatusToString(__err),     \
+                " when calling `" #EXPR "`");     \
+  } while (0)
+namespace at::cuda::tunable {
+// HipDataTypeFor<T>() maps an element type to its hipDataType enumerator.
+// The primary template is declared but not defined, so only the types
+// specialized below can be used.
+template <typename T>
+constexpr hipDataType HipDataTypeFor();
+template <>
+constexpr hipDataType HipDataTypeFor<float>() {
+  return HIP_R_32F;
+}
+template <>
+constexpr hipDataType HipDataTypeFor<Half>() {
+  return HIP_R_16F;
+}
+template <>
+constexpr hipDataType HipDataTypeFor<BFloat16>() {
+  return HIP_R_16BF;
+}
+template <>
+constexpr hipDataType HipDataTypeFor<double>() {
+  return HIP_R_64F;
+}
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e4m3fnuz>() {
+  return HIP_R_8F_E4M3_FNUZ;
+}
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e5m2fnuz>() {
+  return HIP_R_8F_E5M2_FNUZ;
+}
+// This code is instantiated regardless of ROCm version.
+// Prior to ROCm 6.3, we hard-code the known enum values.
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e4m3fn>() {
+#if ROCM_VERSION >= 60300
+  return HIP_R_8F_E4M3;
+#else
+  // Numeric value used when the named enumerator is unavailable (see above).
+  return static_cast<hipDataType>(28);
+#endif
+}
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e5m2>() {
+#if ROCM_VERSION >= 60300
+  return HIP_R_8F_E5M2;
+#else
+  // Numeric value used when the named enumerator is unavailable (see above).
+  return static_cast<hipDataType>(29);
+#endif
+}
+// This type is not intended for matrix types but rather a scale factor.
+// Return a dummy value to satisfy linker.
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float8_e8m0fnu>() {
+  return static_cast<hipDataType>(500);
+}
+// Same fallback pattern as the fp8 types, gated on ROCm 7.0 instead.
+template <>
+constexpr hipDataType HipDataTypeFor<c10::Float4_e2m1fn_x2>() {
+#if ROCM_VERSION >= 70000
+  return HIP_R_4F_E2M1;
+#else
+  return static_cast<hipDataType>(33);
+#endif
+}
+// GetBatchFromParams: batch count for each params flavour. Only the
+// strided-batched params carry a batch member; every other flavour is 1.
+template <typename T>
+int GetBatchFromParams(const GemmParams<T>* /*params*/) {
+  return 1;
+}
+template <typename T>
+int GetBatchFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return 1;
+}
+// Note: batch is int64_t and is implicitly converted to int here.
+template <typename T>
+int GetBatchFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->batch;
+}
+template <typename T>
+int GetBatchFromParams(const ScaledGemmParams<T>* /*params*/) {
+  return 1;
+}
+// GetStride{A,B,C}FromParams: per-batch stride of each matrix. Only the
+// strided-batched params carry stride members (int64_t, implicitly converted
+// to int here); every other flavour reports 1.
+//
+// --- A ---
+template <typename T>
+int GetStrideAFromParams(const GemmParams<T>* /*params*/) {
+  return 1;
+}
+template <typename T>
+int GetStrideAFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return 1;
+}
+template <typename T>
+int GetStrideAFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->stride_a;
+}
+template <typename T>
+int GetStrideAFromParams(const ScaledGemmParams<T>* /*params*/) {
+  return 1;
+}
+// --- B ---
+template <typename T>
+int GetStrideBFromParams(const GemmParams<T>* /*params*/) {
+  return 1;
+}
+template <typename T>
+int GetStrideBFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return 1;
+}
+template <typename T>
+int GetStrideBFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->stride_b;
+}
+template <typename T>
+int GetStrideBFromParams(const ScaledGemmParams<T>* /*params*/) {
+  return 1;
+}
+// --- C ---
+template <typename T>
+int GetStrideCFromParams(const GemmParams<T>* /*params*/) {
+  return 1;
+}
+template <typename T>
+int GetStrideCFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return 1;
+}
+template <typename T>
+int GetStrideCFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->stride_c;
+}
+template <typename T>
+int GetStrideCFromParams(const ScaledGemmParams<T>* /*params*/) {
+  return 1;
+}
+// GetAlphaFromParams / GetBetaFromParams: GEMM scalars as float.
+// Params flavours that store the scalar return it (implicitly converted to
+// float); flavours without the member return the fixed value shown.
+//
+// --- alpha ---
+template <typename T>
+float GetAlphaFromParams(const GemmParams<T>* params) {
+  return params->alpha;
+}
+template <typename T>
+float GetAlphaFromParams(const GemmAndBiasParams<T>* params) {
+  return params->alpha;
+}
+template <typename T>
+float GetAlphaFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->alpha;
+}
+// ScaledGemmParams has no alpha member.
+template <typename T>
+float GetAlphaFromParams(const ScaledGemmParams<T>* /*params*/) {
+  return 1.0f;
+}
+// --- beta ---
+template <typename T>
+float GetBetaFromParams(const GemmParams<T>* params) {
+  return params->beta;
+}
+// GemmAndBiasParams has no beta member.
+template <typename T>
+float GetBetaFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return 0.0f;
+}
+template <typename T>
+float GetBetaFromParams(const GemmStridedBatchedParams<T>* params) {
+  return params->beta;
+}
+// ScaledGemmParams has no beta member.
+template <typename T>
+float GetBetaFromParams(const ScaledGemmParams<T>* /*params*/) {
+  return 0.0f;
+}
+// Get{A,B}ScalingTypeFromParams: scaling mode of each input matrix.
+// Only ScaledGemmParams stores a scaling type; every other params flavour
+// reports ScalingType::TensorWise.
+template <typename T>
+ScalingType GetAScalingTypeFromParams(const GemmParams<T>* /*params*/) {
+  return ScalingType::TensorWise;
+}
+template <typename T>
+ScalingType GetBScalingTypeFromParams(const GemmParams<T>* /*params*/) {
+  return ScalingType::TensorWise;
+}
+template <typename T>
+ScalingType GetAScalingTypeFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return ScalingType::TensorWise;
+}
+template <typename T>
+ScalingType GetBScalingTypeFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return ScalingType::TensorWise;
+}
+template <typename T>
+ScalingType GetAScalingTypeFromParams(const GemmStridedBatchedParams<T>* /*params*/) {
+  return ScalingType::TensorWise;
+}
+template <typename T>
+ScalingType GetBScalingTypeFromParams(const GemmStridedBatchedParams<T>* /*params*/) {
+  return ScalingType::TensorWise;
+}
+// ScaledGemmParams: forward the stored per-matrix scaling type.
+template <typename T>
+ScalingType GetAScalingTypeFromParams(const ScaledGemmParams<T>* params) {
+  return params->a_scaling_type;
+}
+template <typename T>
+ScalingType GetBScalingTypeFromParams(const ScaledGemmParams<T>* params) {
+  return params->b_scaling_type;
+}
+// Get{A,B,D}ScalePointerFromParams: scale-factor pointers for A, B and the
+// output. Only ScaledGemmParams stores them (a_scale_ptr, b_scale_ptr,
+// c_scale_ptr); every other params flavour reports nullptr.
+//
+// --- A scale ---
+template <typename T>
+const void* GetAScalePointerFromParams(const GemmParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetAScalePointerFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetAScalePointerFromParams(const GemmStridedBatchedParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetAScalePointerFromParams(const ScaledGemmParams<T>* params) {
+  return params->a_scale_ptr;
+}
+// --- B scale ---
+template <typename T>
+const void* GetBScalePointerFromParams(const GemmParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetBScalePointerFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetBScalePointerFromParams(const GemmStridedBatchedParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetBScalePointerFromParams(const ScaledGemmParams<T>* params) {
+  return params->b_scale_ptr;
+}
+// --- D (output) scale; read from c_scale_ptr ---
+template <typename T>
+const void* GetDScalePointerFromParams(const GemmParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetDScalePointerFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetDScalePointerFromParams(const GemmStridedBatchedParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetDScalePointerFromParams(const ScaledGemmParams<T>* params) {
+  return params->c_scale_ptr;
+}
+// GetBiasPointerFromParams / GetBiasTypeFromParams: bias vector and its
+// hipDataType. GemmAndBiasParams and ScaledGemmParams carry a bias; the
+// other flavours report nullptr and HIP_R_32F.
+//
+// --- bias pointer ---
+template <typename T>
+const void* GetBiasPointerFromParams(const GemmParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetBiasPointerFromParams(const GemmAndBiasParams<T>* params) {
+  return params->bias;
+}
+template <typename T>
+const void* GetBiasPointerFromParams(const GemmStridedBatchedParams<T>* /*params*/) {
+  return nullptr;
+}
+template <typename T>
+const void* GetBiasPointerFromParams(const ScaledGemmParams<T>* params) {
+  return params->bias_ptr;
+}
+// --- bias type ---
+template <typename T>
+hipDataType GetBiasTypeFromParams(const GemmParams<T>* /*params*/) {
+  return HIP_R_32F;
+}
+// The bias of GemmAndBiasParams<T> is a const T*, so its type follows T.
+template <typename T>
+hipDataType GetBiasTypeFromParams(const GemmAndBiasParams<T>* /*params*/) {
+  return HipDataTypeFor<T>();
+}
+template <typename T>
+hipDataType GetBiasTypeFromParams(const GemmStridedBatchedParams<T>* /*params*/) {
+  return HIP_R_32F;
+}
+// ScaledGemmParams records the bias dtype at runtime.
+template <typename T>
+hipDataType GetBiasTypeFromParams(const ScaledGemmParams<T>* params) {
+  return at::cuda::ScalarTypeToCudaDataType(params->bias_dtype);
+}
+// GetActivationFromParams: activation epilogue for each params flavour.
+// Only GemmAndBiasParams stores one; every other flavour reports None.
+template <typename T>
+at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmParams<T>* /*params*/) {
+  return at::cuda::blas::GEMMAndBiasActivationEpilogue::None;
+}
+template <typename T>
+at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmAndBiasParams<T>* params) {
+  return params->activation;
+}
+template <typename T>
+at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmStridedBatchedParams<T>* /*params*/) {
+  return at::cuda::blas::GEMMAndBiasActivationEpilogue::None;
+}
+template <typename T>
+at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const ScaledGemmParams<T>* /*params*/) {
+  return at::cuda::blas::GEMMAndBiasActivationEpilogue::None;
+}
+// Converts a BLAS transpose character (case-insensitive 'n', 't' or 'c') to
+// the matching hipblasOperation_t. Any other character fails a TORCH_CHECK.
+static hipblasOperation_t _hipblasOpFromChar(char op) {
+  if (op == 'n' || op == 'N') {
+    return HIPBLAS_OP_N;
+  }
+  if (op == 't' || op == 'T') {
+    return HIPBLAS_OP_T;
+  }
+  if (op == 'c' || op == 'C') {
+    return HIPBLAS_OP_C;
+  }
+  TORCH_CHECK(false,
+      "_hipblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
+}
+// Inverse of the char -> op mapping: returns 'N', 'T' or 'C' for the three
+// hipblasOperation_t values handled here. Any other value fails a TORCH_CHECK.
+static char _charFromhipblasOp(hipblasOperation_t op) {
+  if (op == HIPBLAS_OP_N) {
+    return 'N';
+  }
+  if (op == HIPBLAS_OP_T) {
+    return 'T';
+  }
+  if (op == HIPBLAS_OP_C) {
+    return 'C';
+  }
+  TORCH_CHECK(false,
+      "_charFromhipblasOp input should be HIPBLAS_OP_N/T/C but got `", op, "`");
+}
+// Maps a BlasOp layout to a hipBLASLt operation: BlasOp::N becomes
+// HIPBLAS_OP_N, every other value becomes HIPBLAS_OP_T.
+static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) {
+  return layout == BlasOp::N ? HIPBLAS_OP_N : HIPBLAS_OP_T;
+}
+// Deleter functor for std::unique_ptr: invokes `destructor` on a non-null
+// pointer and checks the returned status with TORCH_CUDABLAS_CHECK.
+// A null pointer is ignored.
+template <typename T, cublasStatus_t (*destructor)(T*)>
+struct HipBlasLtDeleter {
+  void operator()(T* x) {
+    if (x == nullptr) {
+      return;
+    }
+    TORCH_CUDABLAS_CHECK(destructor(x));
+  }
+};
+template <typename T, hipblasStatus_t (*destructor)(T*)>  // Base class owning a T through a unique_ptr whose deleter calls `destructor`.
+class HipBlasLtDescriptor {
+ public:
+  T* descriptor() const {  // Raw, non-owning pointer to the held descriptor.
+    return descriptor_.get();
+  }
+  T* descriptor() {  // Non-const overload returning the same raw pointer.
+    return descriptor_.get();
+  }
+ protected:
+  std::unique_ptr<T, HipBlasLtDeleter<T, destructor>> descriptor_;  // Not set by this class; derived classes are expected to reset() it.
+};
+class HipBlasLtMatmulDescriptor : public HipBlasLtDescriptor<  // Owning wrapper around a hipBLASLt matmul descriptor.
+                                     hipblasLtMatmulDescOpaque_t,
+                                     &hipblasLtMatmulDescDestroy> {
+ public:
+  HipBlasLtMatmulDescriptor(  // Creates the descriptor; a failed create is reported through TORCH_HIPBLASLT_CHECK.
+      hipblasComputeType_t compute_type,
+      hipDataType scale_type) {
+    hipblasLtMatmulDesc_t raw_descriptor = nullptr;
+    TORCH_HIPBLASLT_CHECK(
+        hipblasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type));
+    descriptor_.reset(raw_descriptor);  // From here on the base class's unique_ptr owns the descriptor.
+  }
+  template <typename T>
+  inline void setAttribute(hipblasLtMatmulDescAttributes_t attr, const T value) {  // Sets `attr` from the bytes of `value` (sizeof(T) bytes are passed).
+    TORCH_HIPBLASLT_CHECK(::hipblasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T)));
+  }
+};
+// Tunable GEMM candidate that runs one specific hipBLASLt algorithm.
+//
+// Call() builds the A/B/C matrix layouts and the matmul descriptor
+// (transposes, optional scale pointers, optional bias + activation epilogue)
+// from `params`, asks hipBLASLt whether `algo_` supports the problem, and if
+// so launches the matmul on the current stream, writing into params->c.
+//
+// Returns OK after a successful launch, FAIL when the algorithm is reported
+// as unsupported or its required workspace is not smaller than the available
+// workspace. Failing TORCH_CHECK / TORCH_HIPBLASLT_CHECK calls throw.
+template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout, typename ParamsT>
+class HipblasltGemmOp : public Callable<ParamsT> {
+  public:
+    HipblasltGemmOp(hipblasLtMatmulAlgo_t algo) : algo_{algo} {}
+    TuningStatus Call(const ParamsT* params) override {
+      hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout);
+      hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout);
+      auto a_datatype = HipDataTypeFor<AT>();
+      auto b_datatype = HipDataTypeFor<BT>();
+      auto in_out_datatype = HipDataTypeFor<CT>();
+      auto opa = _hipblasOpFromChar(params->transa);
+      auto opb = _hipblasOpFromChar(params->transb);
+      // The compile-time layouts and the runtime transpose flags must agree.
+      TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen");
+      float alpha = GetAlphaFromParams<CT>(params);
+      float beta = GetBetaFromParams<CT>(params);
+      // Matrix layouts are raw hipBLASLt handles. The guard below destroys
+      // whatever has been created if this function is left early (return FAIL
+      // or an exception from one of the checks), so those paths do not leak.
+      hipblasLtMatrixLayout_t mat_a = nullptr, mat_b = nullptr, mat_c = nullptr;
+      struct LayoutCleanup {
+        hipblasLtMatrixLayout_t& a;
+        hipblasLtMatrixLayout_t& b;
+        hipblasLtMatrixLayout_t& c;
+        bool armed;
+        ~LayoutCleanup() {
+          if (!armed) {
+            return;
+          }
+          // Best effort only: a destructor must not throw, so the returned
+          // status is deliberately ignored here.
+          if (a != nullptr) {
+            static_cast<void>(hipblasLtMatrixLayoutDestroy(a));
+          }
+          if (b != nullptr) {
+            static_cast<void>(hipblasLtMatrixLayoutDestroy(b));
+          }
+          if (c != nullptr) {
+            static_cast<void>(hipblasLtMatrixLayoutDestroy(c));
+          }
+        }
+      } layout_cleanup{mat_a, mat_b, mat_c, true};
+      // Rows/cols of A and B are swapped when the operand is transposed.
+      if (opa == HIPBLAS_OP_N) {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->m, params->k, params->lda));
+      }
+      else {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->k, params->m, params->lda));
+      }
+      if (opb == HIPBLAS_OP_N) {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->k, params->n, params->ldb));
+      }
+      else {
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->n, params->k, params->ldb));
+      }
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_c, in_out_datatype, params->m, params->n, params->ldc));
+      // specific to batched gemm
+      int batch = GetBatchFromParams<CT>(params);
+      if (batch > 1) {
+        int64_t stride_a = GetStrideAFromParams<CT>(params);
+        int64_t stride_b = GetStrideBFromParams<CT>(params);
+        int64_t stride_c = GetStrideCFromParams<CT>(params);
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_a, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_a, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_a, sizeof(stride_a)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_b, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_b, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_b, sizeof(stride_b)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_c, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
+        TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
+            mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c)));
+      }
+      hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
+      if (at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) {
+        computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
+      }
+      // The matmul descriptor is owned by this wrapper and released by its
+      // deleter when `matmul` goes out of scope.
+      HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F);
+      matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa);
+      matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb);
+      // specific to scaled gemm
+      const void* mat1_scale_ptr = GetAScalePointerFromParams<CT>(params);
+      const void* mat2_scale_ptr = GetBScalePointerFromParams<CT>(params);
+      const void* result_scale_ptr = GetDScalePointerFromParams<CT>(params);
+      if (mat1_scale_ptr && mat2_scale_ptr) {
+        hipblasLtMatmulDescAttributes_t a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER;
+        hipblasLtMatmulDescAttributes_t b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER;
+        // Row-wise scaling is expressed either through a scale mode or through
+        // a dedicated pointer attribute, depending on which macro is defined.
+        if (GetAScalingTypeFromParams<CT>(params) == ScalingType::RowWise) {
+#if defined(HIPBLASLT_OUTER_VEC)
+          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
+#elif defined(HIPBLASLT_VEC_EXT)
+          a_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
+#endif
+        }
+        if (GetBScalingTypeFromParams<CT>(params) == ScalingType::RowWise) {
+#if defined(HIPBLASLT_OUTER_VEC)
+          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_MODE, HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F);
+#elif defined(HIPBLASLT_VEC_EXT)
+          b_scale_ptr_desc = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
+#endif
+        }
+        matmul.setAttribute(a_scale_ptr_desc, mat1_scale_ptr);
+        matmul.setAttribute(b_scale_ptr_desc, mat2_scale_ptr);
+      }
+      if (result_scale_ptr) {
+        matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
+      }
+      // Bias and the epilogue are only configured when a bias pointer exists.
+      const void* bias_ptr = GetBiasPointerFromParams<CT>(params);
+      auto bias_datatype = GetBiasTypeFromParams<CT>(params);
+      if (bias_ptr) {
+        matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr);
+        matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, bias_datatype);
+        auto activation = GetActivationFromParams<CT>(params);
+        if (activation == at::cuda::blas::GEMMAndBiasActivationEpilogue::RELU) {
+          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_RELU_BIAS);
+        }
+        else if (activation == at::cuda::blas::GEMMAndBiasActivationEpilogue::GELU) {
+          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_GELU_BIAS);
+        }
+        else {
+          matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_BIAS);
+        }
+      }
+      size_t workspace_size = at::cuda::getCUDABlasLtWorkspaceSize();
+      auto op_handle = at::cuda::getCurrentCUDABlasLtHandle();
+      size_t ret_workspace_size = 0;
+      auto status = hipblaslt_ext::matmulIsAlgoSupported(op_handle,
+          matmul.descriptor(),
+          &alpha,
+          mat_a,
+          mat_b,
+          &beta,
+          mat_c,
+          mat_c,
+          algo_,
+          ret_workspace_size);
+      // Reject the candidate when it is unsupported or when its workspace
+      // requirement is not strictly below the available workspace. The guard
+      // releases the matrix layouts on this path.
+      if (status != HIPBLAS_STATUS_SUCCESS || ret_workspace_size >= workspace_size) {
+        return FAIL;
+      }
+      void* workspace_buffer = at::cuda::getCUDABlasLtWorkspace();
+      // C is used both as the input C matrix and as the output D matrix.
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle,
+            matmul.descriptor(),
+            &alpha,
+            params->a,
+            mat_a,
+            params->b,
+            mat_b,
+            &beta,
+            params->c,
+            mat_c,
+            params->c,
+            mat_c,
+            &algo_,
+            workspace_buffer,
+            workspace_size,
+            at::cuda::getCurrentCUDAStream()));
+      // Success path: disarm the guard and destroy the layouts with status
+      // checking, so they are not destroyed a second time by the guard.
+      layout_cleanup.armed = false;
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a));
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b));
+      TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c));
+      return OK;
+    }
+  private:
+    hipblasLtMatmulAlgo_t algo_;
+};
+// Builds the list of hipBLASLt GEMM candidates for one combination of
+// element types and layouts.
+//
+// Queries hipblaslt_ext::getAllAlgos on a temporary hipBLASLt handle and
+// wraps every returned algorithm in a HipblasltGemmOp. Each entry is paired
+// with the name "Gemm_Hipblaslt_<algo index>".
+// Returns a vector of (name, callable) pairs; it is empty on ROCm 6.4 for the
+// fp32 transposed/transposed case excluded below.
+// Failing TORCH_HIPBLASLT_CHECK calls throw.
+template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout, typename ParamsT>
+auto GetHipBlasLtTypeStringAndOps() {
+  hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout);
+  hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout);
+  auto a_datatype = HipDataTypeFor<AT>();
+  auto b_datatype = HipDataTypeFor<BT>();
+  auto in_out_datatype = HipDataTypeFor<CT>();
+  std::vector<hipblasLtMatmulHeuristicResult_t> heuristic_result;
+#if ROCM_VERSION == 60400
+  // hipblaslt TT fp32 regression on ROCm 6.4, cannot use
+  if ((a_datatype == HIP_R_32F || b_datatype == HIP_R_32F || in_out_datatype == HIP_R_32F)
+          && (transa_outer == HIPBLAS_OP_T && transb_outer == HIPBLAS_OP_T)) {
+    std::vector<std::pair<std::string, std::unique_ptr<Callable<ParamsT>>>> ignore;
+    return ignore;
+  }
+#endif
+  hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
+  // NOTE(review): HipblasltGemmOp::Call selects TF32 through
+  // float32Precision(...) rather than allowTF32CuBLAS(); confirm that both
+  // queries are meant to agree.
+  if (at::globalContext().allowTF32CuBLAS()) {
+    computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
+  }
+  hipblasLtHandle_t handle;
+  TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle));
+  // Destroys the temporary handle if the algorithm query below throws, so the
+  // handle does not leak on that path.
+  struct HandleCleanup {
+    hipblasLtHandle_t& h;
+    bool armed;
+    ~HandleCleanup() {
+      if (armed) {
+        // Best effort only: a destructor must not throw.
+        static_cast<void>(hipblasLtDestroy(h));
+      }
+    }
+  } handle_cleanup{handle, true};
+  TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle,
+        hipblaslt_ext::GemmType::HIPBLASLT_GEMM,
+        transa_outer,
+        transb_outer,
+        a_datatype,
+        b_datatype,
+        in_out_datatype,
+        in_out_datatype,
+        computeType,
+        heuristic_result));
+  // Normal path: disarm the guard and destroy the handle with status checking.
+  handle_cleanup.armed = false;
+  TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle));
+  std::vector<std::pair<std::string, std::unique_ptr<Callable<ParamsT>>>> ret;
+  ret.reserve(heuristic_result.size());
+  for (const auto& result : heuristic_result) {
+    // Local copy of the algorithm, passed on to the index query and the op.
+    auto algo = result.algo;
+    int algo_index = hipblaslt_ext::getIndexFromAlgo(algo);
+    auto callable = std::make_unique<HipblasltGemmOp<AT, BT, CT, ALayout, BLayout, ParamsT>>(algo);
+    std::string type_string = fmt::sprintf("Gemm_Hipblaslt_%d", algo_index);
+    ret.emplace_back(type_string, std::move(callable));
+  }
+  return ret;
+}
+template <typename T, BlasOp ALayout, BlasOp BLayout>  // Candidates for plain GEMM: A, B and C all of type T, GemmParams<T>.
+auto GetHipBlasLtGemmTypeStringAndOps() {
+  return GetHipBlasLtTypeStringAndOps<T, T, T, ALayout, BLayout, GemmParams<T>>();
+}
+template <typename T, BlasOp ALayout, BlasOp BLayout>  // Candidates for GEMM with bias: same types, GemmAndBiasParams<T>.
+auto GetHipBlasLtGemmAndBiasTypeStringAndOps() {
+  return GetHipBlasLtTypeStringAndOps<T, T, T, ALayout, BLayout, GemmAndBiasParams<T>>();
+}
+template <typename T, BlasOp ALayout, BlasOp BLayout>  // Candidates for strided-batched GEMM: same types, GemmStridedBatchedParams<T>.
+auto GetHipBlasLtGemmStridedBatchedTypeStringAndOps() {
+  return GetHipBlasLtTypeStringAndOps<T, T, T, ALayout, BLayout, GemmStridedBatchedParams<T>>();
+}
+template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout>  // Candidates for scaled GEMM: independent A/B/C types, ScaledGemmParams<CT>.
+auto GetHipBlasLtScaledGemmTypeStringAndOps() {
+  return GetHipBlasLtTypeStringAndOps<AT, BT, CT, ALayout, BLayout, ScaledGemmParams<CT>>();
+}
+#undef TORCH_HIPBLASLT_CHECK
+}  // namespace at::cuda::tunable
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h ADDED Viewed

	@@ -0,0 +1,282 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/tunable/TunableOp.h>
+#include <ATen/cuda/tunable/GemmCommon.h>
+#include <c10/util/StringUtil.h>
+#include <fmt/printf.h>
+#define ROCBLAS_BETA_FEATURES_API
+#include <rocblas/rocblas.h>
+#define TORCH_ROCBLAS_CHECK(EXPR)                 \
+  do {                                            \
+    rocblas_status __err = EXPR;                  \
+    TORCH_CHECK(__err == rocblas_status_success,  \
+                "rocblas error: ",                \
+                rocblas_status_to_string(__err),  \
+                " when calling `" #EXPR "`");     \
+  } while (0)  // Evaluates EXPR once; a non-success rocblas_status fails TORCH_CHECK with the status string and the stringified EXPR.
+namespace at::cuda::tunable {
+template <typename T>  // Maps a C++ element type to its rocblas_datatype; only declared here, defined per type by the specializations below.
+constexpr rocblas_datatype RocBlasDataTypeFor();
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<float>() {
+  return rocblas_datatype_f32_r;
+}
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<double>() {
+  return rocblas_datatype_f64_r;
+}
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<Half>() {
+  return rocblas_datatype_f16_r;
+}
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<BFloat16>() {
+  return rocblas_datatype_bf16_r;
+}
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<c10::complex<float>>() {
+  return rocblas_datatype_f32_c;
+}
+template <>
+constexpr rocblas_datatype RocBlasDataTypeFor<c10::complex<double>>() {
+  return rocblas_datatype_f64_c;
+}
+template <typename T>  // Maps a C++ element type to the rocblas_datatype used as compute type; defined per type by the specializations below.
+constexpr rocblas_datatype RocBlasComputeTypeFor();
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<float>() {
+  return rocblas_datatype_f32_r;
+}
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<double>() {
+  return rocblas_datatype_f64_r;
+}
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<Half>() {
+  // Note that we're returning the _compute_ type for a given datatype.
+  // As of 12/2022, using compute type FP16 for 16-bit floats was much
+  // slower than using compute type FP32. So we use FP32 compute even for
+  // FP16 datatypes. This is how GEMM is implemented even in the function
+  // rocblasGemmHelper (see fpgeneric.h)
+  return rocblas_datatype_f32_r;
+}
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<BFloat16>() {
+  // Note that we're returning the _compute_ type for a given datatype.
+  // As of 12/2022, using compute type FP16 for 16-bit floats was much
+  // slower than using compute type FP32. So we use FP32 compute even for
+  // BF16 datatypes. This is how GEMM is implemented even in the function
+  // rocblasGemmHelper (see fpgeneric.h)
+  return rocblas_datatype_f32_r;
+}
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<c10::complex<float>>() {  // Complex types compute in their own complex type.
+  return rocblas_datatype_f32_c;
+}
+template <>
+constexpr rocblas_datatype RocBlasComputeTypeFor<c10::complex<double>>() {
+  return rocblas_datatype_f64_c;
+}
+// Converts a GEMM scalar (alpha/beta) to the type handed to rocBLAS.
+// The generic version returns the value unchanged.
+template <typename T>
+auto DoCastForHalfOrBfloat16(const T fp) {
+  return fp;
+}
+// Half: alpha and beta must match the compute type, which is float for Half.
+template <>
+inline auto DoCastForHalfOrBfloat16<Half>(const Half fp) {
+  return static_cast<float>(fp);
+}
+// BFloat16: alpha and beta must match the compute type, which is float for bfloat16.
+template <>
+inline auto DoCastForHalfOrBfloat16<BFloat16>(const BFloat16 fp) {
+  return static_cast<float>(fp);
+}
+// Converts a BLAS transpose character (case-insensitive 'n', 't' or 'c') to
+// the matching rocblas_operation. Any other character fails a TORCH_CHECK.
+static rocblas_operation _rocblasOpFromChar(char op) {
+  if (op == 'n' || op == 'N') {
+    return rocblas_operation_none;
+  }
+  if (op == 't' || op == 'T') {
+    return rocblas_operation_transpose;
+  }
+  if (op == 'c' || op == 'C') {
+    return rocblas_operation_conjugate_transpose;
+  }
+  TORCH_CHECK(false,
+      "_rocblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
+}
+// Tunable GEMM candidate that runs rocblas_gemm_ex with one fixed solution index.
+template <typename T>
+class RocblasGemmOp : public Callable<GemmParams<T>> {
+  public:
+    RocblasGemmOp(int solution) : solution_{solution} {}
+    // Runs the GEMM described by `params`, with params->c passed as both the
+    // C input and the D output. Returns FAIL when TF32 is requested for float
+    // data or when rocBLAS reports a non-success status, OK otherwise.
+    TuningStatus Call(const GemmParams<T>* params) override {
+      const auto data_type = RocBlasDataTypeFor<T>();
+      const bool tf32_requested =
+          at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32;
+      if (tf32_requested && data_type == rocblas_datatype_f32_r) {
+        return FAIL;  // no support for TF32 in rocBLAS
+      }
+      const auto accum_type = RocBlasComputeTypeFor<T>();
+      // alpha/beta are passed by address, converted by the cast helper first.
+      auto alpha = DoCastForHalfOrBfloat16(params->alpha);
+      auto beta = DoCastForHalfOrBfloat16(params->beta);
+      const auto result = rocblas_gemm_ex(
+          (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(),
+          _rocblasOpFromChar(params->transa),
+          _rocblasOpFromChar(params->transb),
+          params->m, params->n, params->k,
+          &alpha,
+          params->a, data_type, params->lda,
+          params->b, data_type, params->ldb,
+          &beta,
+          params->c, data_type, params->ldc,
+          params->c, data_type, params->ldc,
+          accum_type,
+          rocblas_gemm_algo_solution_index,
+          solution_,
+          rocblas_gemm_flags_none);
+      return result == rocblas_status_success ? OK : FAIL;
+    }
+  private:
+    int solution_;
+};
+// Builds the list of rocBLAS GEMM candidates for element type T.
+//
+// Queries rocBLAS twice: first for the number of solutions available for the
+// data/compute type of T, then for the solution indices themselves. Every
+// index is wrapped in a RocblasGemmOp<T> and paired with the name
+// "Gemm_Rocblas_<index>".
+// Returns a vector of (name, callable) pairs in ascending index order.
+// Failing TORCH_ROCBLAS_CHECK calls throw.
+template <typename T>
+auto GetRocBlasGemmTypeStringAndOps() {
+  rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle();
+  // Zero-initialized so the vector size below is never read from an
+  // indeterminate value.
+  int solution_size = 0;
+  auto input_output_type = RocBlasDataTypeFor<T>();
+  auto compute_type = RocBlasComputeTypeFor<T>();
+  // Get the number of available solutions
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            nullptr,
+                                                            &solution_size));
+  std::vector<int> solutions(solution_size);
+  // Get the list of available solutions
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            solutions.data(),
+                                                            &solution_size));
+  // Sort the solutions in ascending order to make the solution vector
+  // deterministic across runs, matching the strided-batched variant.
+  std::sort(solutions.begin(), solutions.end());
+  std::vector<std::pair<std::string, std::unique_ptr<Callable<GemmParams<T>>>>> ret;
+  ret.reserve(solutions.size());
+  for (const int solution : solutions) {
+    auto callable = std::make_unique<RocblasGemmOp<T>>(solution);
+    ret.emplace_back(fmt::sprintf("Gemm_Rocblas_%d", solution), std::move(callable));
+  }
+  return ret;
+}
+// Tunable strided-batched GEMM candidate that runs
+// rocblas_gemm_strided_batched_ex with one fixed solution index.
+template <typename T>
+class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>> {
+  public:
+    RocblasGemmStridedBatchedOp(int solution) : solution_{solution} {}
+    // Runs the batched GEMM described by `params`, with params->c passed as
+    // both the C input and the D output. Returns FAIL when TF32 is requested
+    // for float data or when rocBLAS reports a non-success status, OK otherwise.
+    TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
+      const auto data_type = RocBlasDataTypeFor<T>();
+      const bool tf32_requested =
+          at::globalContext().float32Precision(at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32;
+      if (tf32_requested && data_type == rocblas_datatype_f32_r) {
+        return FAIL;  // no support for TF32 in rocBLAS
+      }
+      const auto accum_type = RocBlasComputeTypeFor<T>();
+      // alpha/beta are passed by address, converted by the cast helper first.
+      auto alpha = DoCastForHalfOrBfloat16(params->alpha);
+      auto beta = DoCastForHalfOrBfloat16(params->beta);
+      const auto result = rocblas_gemm_strided_batched_ex(
+          (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(),
+          _rocblasOpFromChar(params->transa),
+          _rocblasOpFromChar(params->transb),
+          params->m, params->n, params->k,
+          &alpha,
+          params->a, data_type, params->lda, params->stride_a,
+          params->b, data_type, params->ldb, params->stride_b,
+          &beta,
+          params->c, data_type, params->ldc, params->stride_c,
+          params->c, data_type, params->ldc, params->stride_c,
+          params->batch,
+          accum_type,
+          rocblas_gemm_algo_solution_index,
+          solution_,
+          rocblas_gemm_flags_none);
+      return result == rocblas_status_success ? OK : FAIL;
+    }
+  private:
+    int solution_;
+};
+// Builds the list of rocBLAS strided-batched GEMM candidates for element
+// type T: one RocblasGemmStridedBatchedOp<T> per solution index reported by
+// rocBLAS, each paired with the name "Gemm_Rocblas_<index>", in ascending
+// index order. Failing TORCH_ROCBLAS_CHECK calls throw.
+template <typename T>
+auto GetRocBlasGemmStridedBatchedTypeStringAndOps() {
+  using NamedOp = std::pair<std::string, std::unique_ptr<Callable<GemmStridedBatchedParams<T>>>>;
+  rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle();
+  int solution_size;
+  auto input_output_type = RocBlasDataTypeFor<T>();
+  auto compute_type = RocBlasComputeTypeFor<T>();
+  // First query: only the number of available solutions is requested.
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            nullptr,
+                                                            &solution_size));
+  std::vector<int> solutions(solution_size);
+  // Second query: fill the vector with the solution indices.
+  TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
+                                                            input_output_type,
+                                                            input_output_type,
+                                                            compute_type,
+                                                            rocblas_gemm_flags_none,
+                                                            solutions.data(),
+                                                            &solution_size));
+  // Ascending order keeps the candidate list deterministic across runs.
+  std::sort(solutions.begin(), solutions.end());
+  std::vector<NamedOp> ret;
+  for (const int solution : solutions) {
+    auto callable = std::make_unique<RocblasGemmStridedBatchedOp<T>>(solution);
+    ret.emplace_back(c10::str("Gemm_Rocblas_", solution), std::move(callable));
+  }
+  return ret;
+}
+}  // namespace at::cuda::tunable
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h ADDED Viewed

	@@ -0,0 +1,55 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+#include <cuda_runtime.h>
+#include <ATen/cuda/tunable/Tunable.h>
+namespace at::cuda::tunable {
+class StreamTimer : public ITimer {  // ITimer implementation holding a start/end pair of cudaEvent_t; members are defined outside this header.
+  public:
+    StreamTimer();
+    ~StreamTimer() override;
+    void Start() override;
+    void End() override;
+    float Duration() override;  // Declared as float, like ITimer::Duration.
+  private:
+    cudaEvent_t start_{};  // Value-initialized here; presumably created in the constructor - confirm in the implementation file.
+    cudaEvent_t end_{};
+};
+class StreamTimerNoSync : public ITimer {  // Second ITimer implementation with the same members as StreamTimer; NOTE(review): presumably skips a synchronization StreamTimer performs - confirm in the implementation file.
+  public:
+    StreamTimerNoSync();
+    ~StreamTimerNoSync() override;
+    void Start() override;
+    void End() override;
+    float Duration() override;
+  private:
+    cudaEvent_t start_{};  // Value-initialized here.
+    cudaEvent_t end_{};
+};
+} // namespace at::cuda::tunable
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/Tunable.h ADDED Viewed

	@@ -0,0 +1,270 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+#include <c10/util/CallOnce.h>
+#include <c10/util/StringUtil.h>
+#include <c10/util/env.h>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#define TUNABLE_LOGV(LEVEL, ...) getTuningContext()->Log(LEVEL, __VA_ARGS__)  // Forwards LEVEL and the message parts to TuningContext::Log on the global context.
+#define TUNABLE_LOG1(...) TUNABLE_LOGV(1, __VA_ARGS__)  // Shorthand for level 1.
+#define TUNABLE_LOG2(...) TUNABLE_LOGV(2, __VA_ARGS__)  // Shorthand for level 2.
+#define TUNABLE_LOG3(...) TUNABLE_LOGV(3, __VA_ARGS__)  // Shorthand for level 3.
+namespace at::cuda::tunable {
+enum TORCH_CUDA_CPP_API TuningStatus {  // Unscoped status enum returned by tunable ops and validators.
+  OK = 0,
+  FAIL = 1,
+  UNSUPPORTED = 2,
+};
+// Mapping from params signature to kernel id
+class TORCH_CUDA_CPP_API ResultEntry {  // Value type holding a key string, a time and an optional blas signature string.
+  public:
+    explicit ResultEntry(std::string  key, double time) : key_(std::move(key)), time_(time) {}  // blas_sig_ is left empty.
+    explicit ResultEntry(std::string  key, double time, std::string blas_sig ) : key_(std::move(key)), time_(time), blas_sig_(std::move(blas_sig)) {}
+    bool operator==(const ResultEntry& other) const { return key_ == other.key_; }  // Equality compares key_ only; time_ and blas_sig_ are ignored.
+    bool operator!=(const ResultEntry& other) const { return key_ != other.key_; }
+    operator std::string () { return key_; }  // Implicit, non-const conversion returning a copy of key_.
+    std::string GetKey() const { return key_; }  // Returns key_ by value.
+    double GetTime() const { return time_; }
+    friend std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry);  // Defined outside this header.
+    static ResultEntry Null() { return ResultEntry("Null", 0.0); }  // Entry with key "Null" and time 0.
+    static ResultEntry Default() { return ResultEntry("Default", 0.0); }  // Entry with key "Default" and time 0.
+  private:
+    std::string key_;
+    double time_;
+    std::string blas_sig_;  // No accessor is visible in this class; presumably read by the friend operator<< - confirm.
+};
+// String-keyed map of ResultEntry values; presumably keyed by params signature - confirm against TuningResultsManager.
+using KernelMap = std::unordered_map<std::string, ResultEntry>;
+// String-keyed map of KernelMap values; presumably keyed by op signature - confirm against TuningResultsManager.
+using ResultsMap = std::unordered_map<std::string, KernelMap>;
+// String-keyed map of string sets, used for the untuned-results member of TuningResultsManager.
+using UntunedMap = std::unordered_map<std::string, std::unordered_set<std::string>>;
+struct TORCH_CUDA_CPP_API TuningResults {  // Plain aggregate bundling validator strings with a ResultsMap.
+  // Validates if these results are compatible with the libraries
+  std::unordered_map<std::string, std::string> validators;
+  // Mapping from Callable signature to Callable's tuning result
+  ResultsMap results;
+};
+class TORCH_CUDA_CPP_API TuningResultsManager {  // Holds a ResultsMap and an UntunedMap plus state for a realtime output file; all member functions are defined outside this header.
+  public:
+    TuningResultsManager() = default;
+    ~TuningResultsManager() = default;
+    KernelMap Lookup(const std::string& op_signature);  // Returns a KernelMap by value for the given op signature.
+    ResultEntry Lookup(const std::string& op_signature, const std::string& params_signature);  // Returns a single ResultEntry by value.
+    void AddImpl(const std::string& op_signature,
+        const std::string& params_signature,
+        ResultEntry best,
+        KernelMap& kernel_map);  // kernel_map is a mutable reference, so it can be modified by the call.
+    void Add(const std::string& op_signature,
+        const std::string& params_signature,
+        ResultEntry best);
+    void Delete(const std::string& op_signature, const std::string& params_signature);
+    void DisjointMergeImpl(
+        const std::string& op_signature,
+        const KernelMap& kernel_map,
+        /*out*/ ResultsMap& results);  // `results` is marked as an output parameter.
+    void Load(const ResultsMap& results_to_load);
+    ResultsMap Dump();  // Returns a ResultsMap by value.
+    void DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map);
+    size_t GetSize();
+    void RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature,
+      const std::string& params_signature, const std::string& blas_signature);
+    void InitRealtimeAppend(
+        const std::string& filename,
+        const std::unordered_map<std::string, std::string>& validators);
+    void AppendResultLine(const std::string& op_sig,
+                         const std::string& param_sig,
+                         const ResultEntry& result);
+    void CloseRealtimeAppend();  // For clean shutdown
+  private:
+    std::mutex lock_;  // NOTE(review): presumably guards results_ / untuned_results_ - confirm in the implementation file.
+    std::mutex realtime_file_mutex_;  // NOTE(review): presumably guards realtime_out_ - confirm in the implementation file.
+    std::unique_ptr<std::ofstream> realtime_out_;  // Null by default.
+    std::string realtime_filename_;
+    ResultsMap results_;
+    UntunedMap untuned_results_;
+    bool validators_written_ = false;
+};
+class TORCH_CUDA_CPP_API TuningResultsValidator {  // Registry of named (getter, validator) function pairs; member functions are defined outside this header.
+  public:
+    using GetFunc = std::function<std::string()>;  // Produces a string value.
+    using ValidateFunc = std::function<TuningStatus(const std::string&)>;  // Checks a string value and reports a TuningStatus.
+    using GetValidateFuncs = std::unordered_map<std::string, std::pair<GetFunc, ValidateFunc>>;  // Key -> (getter, validator).
+    TuningResultsValidator();
+    ~TuningResultsValidator() = default;
+    std::unordered_map<std::string, std::string> GetAllValidators() const;  // Returns a key -> string map by value.
+    TuningStatus ValidateAll(const std::unordered_map<std::string, std::string>& to_validate) const;
+    void RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf);
+  protected:
+    static std::string GetPyTorchVersion() ;
+    TuningStatus ValidatePyTorchVersion(const std::string& value) const;
+  public:
+    static constexpr const std::array mandatory_keys{"PT_VERSION"};  // Single-element array; NOTE(review): std::array is used but <array> is not among this header's visible includes.
+  private:
+    GetValidateFuncs validators_;
+};
+struct NumericalCheckConfig {  // Settings for the numerical check: an on/off flag plus absolute and relative tolerances.
+  bool   enabled{false};  // Off by default.
+  double atol{1e-5};  // Default absolute tolerance.
+  double rtol{1e-5};  // Default relative tolerance.
+  NumericalCheckConfig() = default;
+  NumericalCheckConfig(bool e, double a, double r) : enabled(e), atol(a), rtol(r) {}  // Sets enabled, atol and rtol in that order.
+};
+class TORCH_CUDA_CPP_API TuningContext {  // Settings and state for TunableOp: enable flags, tuning/warmup limits, results manager, validator and file names. Only Log() is defined in this header.
+  public:
+    TuningContext();
+    ~TuningContext();
+    TuningContext(TuningContext &) = delete;  // Copy and move construction/assignment are all deleted.
+    TuningContext(TuningContext &&) = delete;
+    TuningContext &operator=(TuningContext &) = delete;
+    TuningContext &operator=(TuningContext &&) = delete;
+    void EnableTunableOp(bool value);
+    bool IsTunableOpEnabled() const;
+    void EnableTuning(bool value);
+    bool IsTuningEnabled() const;
+    void EnableRecordUntuned(bool value);
+    bool IsRecordUntunedEnabled() const;
+    std::ofstream& GetUntunedFile();  // Returns a reference to an output file stream.
+    void EnableNumericsCheck(bool value);
+    bool IsNumericsCheckEnabled() const;
+    void SetNumericalCheckConfig(bool enabled, double atol, double rtol);
+    NumericalCheckConfig GetNumericalCheckConfig() const;  // Returns the config by value.
+    void SetMaxTuningDurationMs(int max_duration_ms);
+    int GetMaxTuningDurationMs() const;
+    void SetMaxTuningIterations(int max_iter);
+    int GetMaxTuningIterations() const;
+    void SetMaxWarmupDurationMs(int max_duration_ms);
+    int GetMaxWarmupDurationMs() const;
+    void SetMaxWarmupIterations(int max_iter);
+    int GetMaxWarmupIterations() const;
+    void EnableICacheFlush(bool value);
+    bool IsICacheFlushEnabled() const;
+    void SetRotatingBufferSize(int size);
+    int GetRotatingBufferSize() const;
+    TuningResultsManager& GetTuningResultsManager();
+    TuningResultsValidator& GetTuningResultsValidator();
+    TuningResults GetTuningResults();  // Returns a TuningResults by value.
+    TuningStatus LoadTuningResults(const TuningResults& tr);
+    void SetFilename(const std::string& filename, bool insert_device_ordinal=false);
+    std::string GetFilename() const;
+    bool ReadFile(const std::string& filename={});  // The filename defaults to an empty string.
+    template<class... Types>
+    void Log(int level, Types... args) {  // Writes c10::str(args...) followed by std::endl to GetLog() when GetLogOkay() is true and GetLogLevel() >= level.
+      if (GetLogOkay() && GetLogLevel() >= level) {
+        GetLog() << c10::str(args...) << std::endl;
+      }
+    }
+  private:
+    std::string GetLogFilename() const;
+    int GetLogLevel() const;
+    bool GetLogOkay() const;
+    std::ostream& GetLog() const;
+    bool enable_;  // The members below without initializers are presumably set in the constructor - confirm in the implementation file.
+    bool tuning_enable_;
+    bool record_untuned_enable_;
+    bool manager_initialized_;
+    bool numerics_check_enable_;
+    int max_tuning_duration_ms_;
+    int max_tuning_iterations_;
+    int max_warmup_duration_ms_;
+    int max_warmup_iterations_;
+    bool icache_flush_;
+    int rotating_buffer_size_;
+    mutable TuningResultsManager manager_;  // mutable: can be modified from const member functions.
+    mutable c10::once_flag manager_init_once_;  // NOTE(review): presumably guards one-time initialization of manager_ - confirm.
+    TuningResultsValidator validator_;
+    std::string filename_;
+    std::ofstream untuned_file_;
+    size_t results_count_from_input_file_;
+    bool is_shutting_down_;
+    NumericalCheckConfig numerics_cfg_{};  // Value-initialized, so it starts with NumericalCheckConfig's defaults.
+};
+// Accessor for the TuningContext. TunableOp (TunableOp.h) dereferences the
+// returned pointer without a null check.
+TORCH_CUDA_CPP_API TuningContext* getTuningContext();
+// Abstract timer interface: implementations mark the beginning and end of an
+// interval with Start()/End() and report its length through Duration().
+class ITimer {
+  public:
+    ITimer() = default;
+    virtual ~ITimer() = default;
+    /// Marks the beginning of the timed interval.
+    virtual void Start() = 0;
+    /// Marks the end of the timed interval.
+    virtual void End() = 0;
+    /// Computes the elapsed time in milliseconds between Start() and End()
+    virtual float Duration() = 0;
+};
+} // namespace at::cuda::tunable
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h ADDED Viewed

	@@ -0,0 +1,334 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+#include <ATen/cuda/tunable/GemmCommon.h>
+#ifdef USE_ROCM
+#include <ATen/cuda/tunable/GemmHipblaslt.h>
+#include <ATen/cuda/tunable/GemmRocblas.h>
+#endif
+#include <ATen/cuda/tunable/TunableOp.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/util/Float8_e4m3fn.h>
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e5m2.h>
+#include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/Float8_e8m0fnu.h>
+#include <c10/util/StringUtil.h>
+#include <fmt/printf.h>
+namespace at::cuda::tunable {
+// Baseline GEMM candidate: forwards the problem described by `params` to
+// at::cuda::blas::gemm_internal<T> and always reports OK.
+template <typename T>
+class DefaultGemmOp : public Callable<GemmParams<T>> {
+  public:
+    TuningStatus Call(const GemmParams<T>* params) override {
+      const auto& p = *params;
+      at::cuda::blas::gemm_internal<T>(
+          p.transa, p.transb,
+          p.m, p.n, p.k,
+          p.alpha,
+          p.a, p.lda,
+          p.b, p.ldb,
+          p.beta,
+          p.c, p.ldc);
+      return OK;
+    }
+};
+// True exactly when `op` is 't' or 'T'; every other character yields false.
+static bool _transposeBoolFromChar(char op) {
+  switch (op) {
+    case 't':
+    case 'T':
+      return true;
+    default:
+      return false;
+  }
+}
+// Baseline GEMM+bias candidate: converts the transa/transb characters to bools
+// and forwards everything to at::cuda::blas::gemm_and_bias<T>; always reports OK.
+template <typename T>
+class DefaultGemmAndBiasOp : public Callable<GemmAndBiasParams<T>> {
+  public:
+    TuningStatus Call(const GemmAndBiasParams<T>* params) override {
+      const bool transpose_a = _transposeBoolFromChar(params->transa);
+      const bool transpose_b = _transposeBoolFromChar(params->transb);
+      at::cuda::blas::gemm_and_bias<T>(
+          transpose_a,
+          transpose_b,
+          params->m, params->n, params->k,
+          params->alpha,
+          params->a, params->lda,
+          params->b, params->ldb,
+          params->bias,
+          params->c, params->ldc,
+          params->activation);
+      return OK;
+    }
+};
+// Baseline strided-batched GEMM candidate: forwards the problem described by
+// `params` to at::cuda::blas::bgemm_internal<T> and always reports OK.
+template <typename T>
+class DefaultGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>> {
+  public:
+    TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
+      const auto& p = *params;
+      at::cuda::blas::bgemm_internal<T>(
+          p.transa, p.transb,
+          p.m, p.n, p.k,
+          p.alpha,
+          p.a, p.lda, p.stride_a,
+          p.b, p.ldb, p.stride_b,
+          p.beta,
+          p.c, p.ldc, p.stride_c,
+          p.batch);
+      return OK;
+    }
+};
+// Baseline scaled-GEMM candidate: passes the fields of `params` straight through
+// to at::cuda::blas::scaled_gemm with no alpha (std::nullopt); always reports OK.
+template <typename T>
+class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
+  public:
+    TuningStatus Call(const ScaledGemmParams<T>* params) override {
+      const auto& p = *params;
+      at::cuda::blas::scaled_gemm(
+          p.transa, p.transb,
+          p.m, p.n, p.k,
+          // a_* fields
+          p.a, p.a_scale_ptr, p.lda,
+          p.a_dtype, p.a_scale_dtype, p.a_scaling_type,
+          // b_* fields
+          p.b, p.b_scale_ptr, p.ldb,
+          p.b_dtype, p.b_scale_dtype, p.b_scaling_type,
+          // bias_* fields
+          p.bias_ptr, p.bias_dtype,
+          // c_* fields
+          p.c, p.c_scale_ptr, p.ldc, p.c_dtype,
+          p.use_fast_accum,
+          std::nullopt /* alpha */);
+      return OK;
+    }
+};
+// IsZero(v): whether a scalar equals zero. The generic version compares against
+// the float literal 0.0f; the specializations below cover types that are
+// tested differently.
+template <typename T>
+inline bool IsZero(T v) {
+  return v == 0.0f;
+}
+// BFloat16: tests the `x` member directly instead of converting to float.
+// NOTE(review): presumably `x` is the raw bit pattern, in which case a negative
+// zero (sign bit set) would not be reported as zero -- confirm this is intended.
+template <>
+inline bool IsZero(BFloat16 v) {
+  return v.x == 0;
+}
+// Half: converted to float before the comparison.
+template <>
+inline bool IsZero(Half v) {
+  return float(v) == 0.0f;
+}
+// Complex types: compared against a real scalar of the matching precision.
+template <>
+inline bool IsZero(c10::complex<double> v) {
+  return v == 0.0;
+}
+template <>
+inline bool IsZero(c10::complex<float> v) {
+  return v == 0.0f;
+}
+// TypeName(v): short type label used when building op signatures. Only the
+// argument's type matters; its value is never read. Types without a
+// specialization get "unknown".
+template <typename T>
+inline const char* TypeName(T /*v*/) {
+  return "unknown";
+}
+// float is labelled "tf32" while the global context allows TF32 in cuBLAS,
+// and "float" otherwise.
+template <>
+inline const char* TypeName(float /*v*/) {
+  return at::globalContext().allowTF32CuBLAS() ? "tf32" : "float";
+}
+template <>
+inline const char* TypeName(double /*v*/) {
+  return "double";
+}
+template <>
+inline const char* TypeName(BFloat16 /*v*/) {
+  return "BFloat16";
+}
+template <>
+inline const char* TypeName(Half /*v*/) {
+  return "Half";
+}
+// 8-bit float variants.
+template <>
+inline const char* TypeName(Float8_e4m3fn /*v*/) {
+  return "Float8_e4m3fn";
+}
+template <>
+inline const char* TypeName(Float8_e5m2 /*v*/) {
+  return "Float8_e5m2";
+}
+template <>
+inline const char* TypeName(Float8_e4m3fnuz /*v*/) {
+  return "Float8_e4m3fnuz";
+}
+template <>
+inline const char* TypeName(Float8_e5m2fnuz /*v*/) {
+  return "Float8_e5m2fnuz";
+}
+template <>
+inline const char* TypeName(Float8_e8m0fnu /*v*/) {
+  return "Float8_e8m0fnu";
+}
+// Complex types.
+template <>
+inline const char* TypeName(c10::complex<double> /*v*/) {
+  return "c10::complex<double>";
+}
+template <>
+inline const char* TypeName(c10::complex<float> /*v*/) {
+  return "c10::complex<float>";
+}
+// Tunable GEMM. The constructor assembles the candidate list: the default
+// kernel plus, on ROCm builds, the rocBLAS and hipBLASLt candidates unless the
+// corresponding environment variable disables them.
+// "Default" is handed to RegisterOp both first and last. RegisterOp
+// (TunableOp.h) appends the name on every call while the name->op map keeps
+// the entry already stored for a key, so the default kernel appears twice in
+// the candidate name list.
+template <typename T, BlasOp ALayout, BlasOp BLayout>
+class GemmTunableOp : public TunableOp<GemmParams<T>> {
+ public:
+  GemmTunableOp() {
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmOp<T>>());
+#ifdef USE_ROCM
+    // An unset variable counts as enabled.
+    static const auto rocblas_flag = c10::utils::check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
+    if (rocblas_flag.value_or(true)) {
+      for (auto&& [op_name, candidate] : GetRocBlasGemmTypeStringAndOps<T>()) {
+        this->RegisterOp(std::move(op_name), std::move(candidate));
+      }
+    }
+    static const auto hipblaslt_flag = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
+    if (hipblaslt_flag.value_or(true)) {
+      // disallow tuning of hipblaslt with c10::complex
+      constexpr bool is_complex =
+          std::is_same_v<T, c10::complex<float>> ||
+          std::is_same_v<T, c10::complex<double>>;
+      if constexpr (!is_complex) {
+        for (auto&& [op_name, candidate] : GetHipBlasLtGemmTypeStringAndOps<T, ALayout, BLayout>()) {
+          this->RegisterOp(std::move(op_name), std::move(candidate));
+        }
+      }
+    }
+#endif
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmOp<T>>());
+  }
+  // Signature: op name, element type label and the two layout characters.
+  std::string Signature() override {
+    return fmt::sprintf(
+        "GemmTunableOp_%s_%c%c",
+        TypeName<T>(T{}),
+        BlasOpToString(ALayout),
+        BlasOpToString(BLayout));
+  }
+};
+// Tunable GEMM+bias. The constructor registers the default kernel plus, on ROCm
+// builds, the hipBLASLt candidates unless the environment variable disables them.
+// "Default" is handed to RegisterOp both first and last. RegisterOp
+// (TunableOp.h) appends the name on every call while the name->op map keeps
+// the entry already stored for a key, so the default kernel appears twice in
+// the candidate name list.
+template <typename T, BlasOp ALayout, BlasOp BLayout>
+class GemmAndBiasTunableOp : public TunableOp<GemmAndBiasParams<T>> {
+ public:
+  GemmAndBiasTunableOp() {
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmAndBiasOp<T>>());
+#ifdef USE_ROCM
+    // An unset variable counts as enabled.
+    static const auto hipblaslt_flag = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
+    if (hipblaslt_flag.value_or(true)) {
+      // disallow tuning of hipblaslt with c10::complex
+      constexpr bool is_complex =
+          std::is_same_v<T, c10::complex<float>> ||
+          std::is_same_v<T, c10::complex<double>>;
+      if constexpr (!is_complex) {
+        for (auto&& [op_name, candidate] : GetHipBlasLtGemmAndBiasTypeStringAndOps<T, ALayout, BLayout>()) {
+          this->RegisterOp(std::move(op_name), std::move(candidate));
+        }
+      }
+    }
+#endif
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmAndBiasOp<T>>());
+  }
+  // Signature: op name, element type label and the two layout characters.
+  std::string Signature() override {
+    return fmt::sprintf(
+        "GemmAndBiasTunableOp_%s_%c%c",
+        TypeName<T>(T{}),
+        BlasOpToString(ALayout),
+        BlasOpToString(BLayout));
+  }
+};
+// Tunable strided-batched GEMM. The constructor registers the default kernel
+// plus, on ROCm builds, the rocBLAS and hipBLASLt candidates unless the
+// corresponding environment variable disables them.
+// "Default" is handed to RegisterOp both first and last. RegisterOp
+// (TunableOp.h) appends the name on every call while the name->op map keeps
+// the entry already stored for a key, so the default kernel appears twice in
+// the candidate name list.
+template <typename T, BlasOp ALayout, BlasOp BLayout>
+class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>> {
+ public:
+  GemmStridedBatchedTunableOp() {
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmStridedBatchedOp<T>>());
+#ifdef USE_ROCM
+    // An unset variable counts as enabled.
+    static const auto rocblas_flag = c10::utils::check_env("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
+    if (rocblas_flag.value_or(true)) {
+      for (auto&& [op_name, candidate] : GetRocBlasGemmStridedBatchedTypeStringAndOps<T>()) {
+        this->RegisterOp(std::move(op_name), std::move(candidate));
+      }
+    }
+    static const auto hipblaslt_flag = c10::utils::check_env("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
+    if (hipblaslt_flag.value_or(true)) {
+      // disallow tuning of hipblaslt with c10::complex
+      constexpr bool is_complex =
+          std::is_same_v<T, c10::complex<float>> ||
+          std::is_same_v<T, c10::complex<double>>;
+      if constexpr (!is_complex) {
+        for (auto&& [op_name, candidate] : GetHipBlasLtGemmStridedBatchedTypeStringAndOps<T, ALayout, BLayout>()) {
+          this->RegisterOp(std::move(op_name), std::move(candidate));
+        }
+      }
+    }
+#endif
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmStridedBatchedOp<T>>());
+  }
+  // Signature: op name, element type label and the two layout characters.
+  std::string Signature() override {
+    return fmt::sprintf(
+        "GemmStridedBatchedTunableOp_%s_%c%c",
+        TypeName<T>(T{}),
+        BlasOpToString(ALayout),
+        BlasOpToString(BLayout));
+  }
+};
+// Tunable scaled GEMM, parameterized on the A, B and C element types. The
+// constructor registers the default kernel plus, on ROCm builds, the hipBLASLt
+// scaled-GEMM candidates.
+// "Default" is handed to RegisterOp both first and last. RegisterOp
+// (TunableOp.h) appends the name on every call while the name->op map keeps
+// the entry already stored for a key, so the default kernel appears twice in
+// the candidate name list.
+template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout>
+class ScaledGemmTunableOp : public TunableOp<ScaledGemmParams<CT>> {
+ public:
+  ScaledGemmTunableOp() {
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultScaledGemmOp<CT>>());
+#ifdef USE_ROCM
+    for (auto&& [op_name, candidate] : GetHipBlasLtScaledGemmTypeStringAndOps<AT, BT, CT, ALayout, BLayout>()) {
+      this->RegisterOp(std::move(op_name), std::move(candidate));
+    }
+#endif
+    this->RegisterOp(std::string("Default"), std::make_unique<DefaultScaledGemmOp<CT>>());
+  }
+  // Signature: op name, the three element type labels and the two layout
+  // characters.
+  std::string Signature() override {
+    return fmt::sprintf(
+        "ScaledGemmTunableOp_%s_%s_%s_%c%c",
+        TypeName<AT>(AT{}), TypeName<BT>(BT{}), TypeName<CT>(CT{}),
+        BlasOpToString(ALayout), BlasOpToString(BLayout));
+  }
+};
+} // namespace at::cuda::tunable
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h ADDED Viewed

	@@ -0,0 +1,434 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Original TunableOp is from onnxruntime.
+// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
+// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+//
+// Adapting TunableOp into PyTorch
+// Copyright (c) Advanced Micro Devices, Inc.
+//
+#pragma once
+#include <ATen/cuda/tunable/Tunable.h>
+#include <ATen/cuda/tunable/StreamTimer.h>
+#include <ATen/cuda/Sleep.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#ifndef _WIN32
+#include <cxxabi.h>
+#endif
+#include <cstdlib>
+#include <deque>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+namespace at::cuda::tunable {
+// Interface for one candidate implementation operating on a ParamsT.
+// The base Call() reports FAIL, and IsSupported() defaults to whatever Call()
+// returns for the same params.
+template <typename ParamsT>
+class Callable {
+  public:
+    virtual ~Callable() = default;
+    virtual TuningStatus Call(const ParamsT* /*unused*/) { return FAIL; }
+    virtual TuningStatus IsSupported(const ParamsT* params) {
+      const TuningStatus status = this->Call(params);
+      return status;
+    }
+};
+namespace {
+/**
+ * Running statistics over a stream of samples, updated one value at a time
+ * with the online mean/M2 recurrence described at
+ * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+ *
+ * All fields are public and read directly by callers (_mean, _min, _max, ...).
+ */
+class Stats {
+  public:
+    Stats() = default;
+    /** Folds one observation into the count, sum, extrema, mean and M2. */
+    void sample_value(const double x) {
+      _sum = _sum + x;
+      if (0UL == _n) {
+          // First sample defines both extrema.
+          _min = x;
+          _max = x;
+      }
+      else {
+          _min = _min < x ? _min : x;
+          _max = _max > x ? _max : x;
+      }
+      _n = _n + 1UL;
+      // delta uses the mean *before* the update, the M2 term the mean *after* it.
+      const double delta = x - _mean;
+      _mean = _mean + delta/_n;
+      _M2 = _M2 + delta * (x - _mean);
+    }
+    /**
+     * Sample variance (divides by n-1).
+     * With fewer than two samples there is no spread to measure, so 0.0 is
+     * returned; dividing by _n-1 there would give NaN for one sample and wrap
+     * the unsigned count for zero samples.
+     */
+    double variance() const {
+      if (_n < 2UL) {
+        return 0.0;
+      }
+      return _M2 / static_cast<double>(_n - 1UL);
+    }
+    /** Square root of variance(). */
+    double stddev() const {
+      return std::sqrt(variance());
+    }
+    unsigned long _n{0UL};  // number of samples seen
+    double _mean{0.0};      // running mean
+    double _M2{0.0};        // running sum of squared deviations from the mean
+    double _sum{0.0};       // plain sum of all samples
+    double _min{0.0};       // smallest sample (0.0 until the first sample)
+    double _max{0.0};       // largest sample (0.0 until the first sample)
+};
+// Bounded history of strings: keeps at most `max_size` entries, discarding the
+// oldest one when a new entry would exceed the bound. rbegin()/rend() iterate
+// from the newest entry to the oldest.
+class FixedSizeStack {
+  private:
+      std::deque<std::string> stack;
+      const size_t max_size;
+  public:
+      FixedSizeStack(size_t size) : max_size(size) {}
+      // Appends `value`, evicting the oldest entry first if the stack is full.
+      void push(const std::string& value) {
+          if (max_size == 0) {
+              // A zero-capacity stack holds nothing; popping from the empty
+              // deque here would be undefined behaviour.
+              return;
+          }
+          if (stack.size() >= max_size) {
+              stack.pop_front(); // Remove the oldest entry
+          }
+          stack.push_back(value); // Add new entry
+      }
+      auto rbegin() { return stack.rbegin(); }
+      auto rend() { return stack.rend(); }
+};
+} // anonymous namespace
+template <typename ParamsT>
+class TunableOp {
+  public:
+    virtual ~TunableOp() = default;
+    // Runs the op for `params`.
+    // With TunableOp enabled, the tuning results are consulted first; a miss
+    // either triggers tuning (result stored in the manager) or, if tuning is
+    // off and recording is on, records the untuned problem. With TunableOp
+    // disabled, or when no result is available, the default entry is used.
+    // The selected entry must exist in ops_ (enforced by TORCH_CHECK); its
+    // Call() status is returned.
+    TuningStatus operator()(const ParamsT* params) {
+      ResultEntry result = ResultEntry::Null();
+      TuningContext* ctx = getTuningContext();
+      if (ctx->IsTunableOpEnabled()) {
+        auto& mgr = ctx->GetTuningResultsManager();
+        auto op_sig = Signature();
+        auto params_sig = params->Signature();
+        auto blas_sig = params->BLASSignature();
+        result = mgr.Lookup(op_sig, params_sig);
+        // If no previous tuning result was found, tune now iff tuning is enabled
+        if (result == ResultEntry::Null()) {
+          if (ctx->IsTuningEnabled()) {
+            result = FindFastest(params);
+            mgr.Add(op_sig, params_sig, result);
+          }
+          else if (ctx->IsRecordUntunedEnabled()) {
+            // or record the gemm into file
+            mgr.RecordUntuned(ctx->GetUntunedFile(), op_sig, params_sig, blas_sig);
+          }
+        }
+      }
+      else {
+        result = ResultEntry::Default();
+      }
+      // Still nothing (lookup miss with tuning disabled): fall back to default.
+      if (result == ResultEntry::Null()) {
+        TUNABLE_LOG2("no result, using default");
+        result = ResultEntry::Default();
+      }
+      auto iter = ops_.find(result);
+      TORCH_CHECK(iter != ops_.end());
+      return iter->second->Call(params);
+    }
+    // Returns the op signature, computing it on first use and caching it.
+    // Per the C++17 standard (https://wg21.link/n4659, section 15.7.4), typeid
+    // applied to an object that is still under construction or destruction
+    // yields the std::type_info of the constructor's or destructor's own class.
+    // The signature is therefore generated lazily rather than in a constructor.
+    virtual std::string Signature() {
+      const auto build_signature = [this]() {
+        signature_ = CreateSignature();
+      };
+      c10::call_once(signature_init_once_, build_signature);
+      return signature_;
+    }
+  protected:
+    // Adds a candidate: `name` is appended to op_names_ and `op` is stored in
+    // ops_ under that name. Registering a name that is already present still
+    // appends to op_names_, while ops_ keeps the op it already holds.
+    void RegisterOp(const std::string& name, std::unique_ptr<Callable<ParamsT>> op) {
+      op_names_.push_back(name);
+      ops_.emplace(name, std::move(op));
+    }
+  private:
+    // Calls `op` num_iter times without timing, cycling through the `param`
+    // copies and optionally flushing the instruction cache before each call.
+    // `offset` is shared with the Profile* helpers and advanced once per call.
+    // NOTE(review): the index is (i + offset++), and both i and offset grow by
+    // one per iteration, so the selected copy advances by two each time --
+    // confirm this stride is intended.
+    static void WarmUp(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {
+      TuningContext* ctx = getTuningContext();
+      bool do_flush = ctx->IsICacheFlushEnabled();
+      for (size_t i = 0; i < num_iter; i++) {
+        if (do_flush) {
+          at::cuda::flush_icache();
+        }
+        TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK);
+      }
+    }
+    // Times num_iter calls of `op` with a single timer around the whole loop
+    // and returns the average duration per call. Two untimed calls run first.
+    // When icache flushing is enabled the flush happens inside the timed
+    // region, before each call.
+    // NOTE(review): num_iter == 0 makes the final division a divide-by-zero in
+    // floating point; callers presumably always pass a positive count.
+    // NOTE(review): the index is (i + offset++), so the selected `param` copy
+    // advances by two per iteration -- confirm this stride is intended.
+    static double ProfileSimple(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {
+      TuningContext* ctx = getTuningContext();
+      bool do_flush = ctx->IsICacheFlushEnabled();
+      StreamTimerNoSync timer{};
+      // Small Mandatory Warmup
+      // Reduces outliers
+      for (size_t i = 0; i < 2; i++) {
+        TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK);
+      }
+      timer.Start();
+      for (size_t i = 0; i < num_iter; i++) {
+        if (do_flush) {
+          at::cuda::flush_icache();
+        }
+        TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK);
+      }
+      timer.End();
+      return timer.Duration() / num_iter;
+    }
+    // Times num_iter calls of `op` individually (one timer per call) and returns
+    // the accumulated Stats over the per-call durations. Two untimed calls run
+    // first. When icache flushing is enabled the flush happens after each timed
+    // call, i.e. outside the timed region.
+    // Durations are only read after all calls have been issued.
+    // NOTE(review): the index is (i + offset++), so the selected `param` copy
+    // advances by two per iteration -- confirm this stride is intended.
+    static Stats ProfileStats(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {
+      TuningContext* ctx = getTuningContext();
+      bool do_flush = ctx->IsICacheFlushEnabled();
+      std::vector<StreamTimerNoSync> timer(num_iter);
+      // Small Mandatory Warmup
+      // Reduces outliers
+      for (size_t i = 0; i < 2; i++) {
+        TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK);
+      }
+      for (size_t i = 0; i < num_iter; i++) {
+        timer[i].Start();
+        TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK);
+        timer[i].End();
+        if (do_flush) {
+          at::cuda::flush_icache();
+        }
+      }
+      Stats s;
+      for (size_t i = 0; i < num_iter; i++) {
+        s.sample_value(timer[i].Duration());
+      }
+      return s;
+    }
+  protected:
+    // Benchmarks every registered candidate on copies of `params` and returns a
+    // ResultEntry naming the candidate with the lowest mean duration, together
+    // with that duration and the BLAS signature. If no candidate is accepted
+    // the result names "Default" with an infinite duration.
+    //
+    // Per candidate: one trial call (unsupported candidates are skipped), two
+    // short profiling passes used to discard clearly slower candidates early,
+    // an optional numerics check against the default op's output, then the
+    // configured warmup and the full timing run.
+    //
+    // NOTE(review): the parameter copies are raw pointers released with
+    // Delete() at the end; if one of the TORCH_CHECKs in between throws (as
+    // TORCH_CHECK normally does), those copies are not released.
+    virtual ResultEntry FindFastest(const ParamsT* params) {
+      TuningContext* ctx = getTuningContext();
+      auto op_sig = Signature();
+      auto params_sig = params->Signature();
+      auto blas_sig = params->BLASSignature();
+      TUNABLE_LOG2("finding fastest for ", op_sig, '(', params_sig, ')', " out of ", op_names_.size(), " candidates");
+      auto min_duration_ms = std::numeric_limits<double>::infinity();
+      std::string id_name = "Default";
+      ParamsT* reference_params = nullptr;
+      // Only receives an entry when a candidate becomes the new best.
+      auto top_solns = FixedSizeStack(5);
+      // numeric check option is controlled by non-static env var, so check it once per tuned operator
+      bool do_numerics_check = ctx->IsNumericsCheckEnabled();
+      // calculate a reference answer for numerical check
+      if (do_numerics_check) {
+        reference_params = params->DeepCopy(false);
+        TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);
+      }
+      // need copies of params to reuse
+      // make as many copies as will fill the requested rotating buffer size, if requested
+      // rotating_size guaranteed to be >= 0 even though GetRotatingBufferSize() returns int
+      size_t rotating_size = ctx->GetRotatingBufferSize();
+      bool use_buffer_rotation = (rotating_size > 0);
+      size_t param_size = params->GetSize(use_buffer_rotation);
+      // NOTE(review): assumes GetSize() never returns 0; a zero size would make
+      // this an integer division by zero.
+      size_t param_count = (rotating_size / param_size) + 1;
+      constexpr size_t MB = 1024ull*1024;
+      if (use_buffer_rotation) {
+        TUNABLE_LOG2("Rotating buffer ", rotating_size/MB, " MiB. ",
+            "Needed Size: ", param_size/MB, " MiB. ",
+            "Needed number of param copies: ", param_count);
+      }
+      TORCH_CHECK(param_count > 0);
+      std::vector<ParamsT*> reusable_params(param_count);
+      for (size_t i = 0; i < param_count; i++) {
+        reusable_params[i] = params->DeepCopy(use_buffer_rotation);
+      }
+      // for rotating buffer
+      size_t offset = 0;
+      for (size_t i = 0; i < op_names_.size(); i++) {
+        auto* candidate = ops_[op_names_[i]].get(); // borrow pointer
+        // Trial call: a non-OK status marks the candidate as unsupported.
+        auto status = candidate->Call(reusable_params[0]);
+        if (status != OK) {
+          TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+          continue;
+        }
+        // collect a small profile
+        int approx_num_iter = 3;
+        auto s = ProfileStats(candidate, reusable_params, approx_num_iter, offset);
+        double approx_duration = s._mean;
+        // bail if too slow (more than 1.5x the best mean so far)
+        if (approx_duration > 1.5 * min_duration_ms) {
+          TUNABLE_LOG3("├──skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+          continue;
+        }
+        // 2nd phase skip, more aggressive
+        approx_num_iter = 10;
+        s = ProfileStats(candidate, reusable_params, approx_num_iter, offset);
+        approx_duration = s._mean;
+        // bail if too slow (more than 1.15x the best mean so far)
+        if (approx_duration > 1.15 * min_duration_ms) {
+          TUNABLE_LOG3("├──2nd skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+          continue;
+        }
+        if (do_numerics_check) {
+          // Run the candidate on a fresh copy and compare with the reference.
+          ParamsT* numerical_params = params->DeepCopy(false);
+          auto status = candidate->Call(numerical_params);
+          if (status != OK) {
+            numerical_params->Delete();
+            TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+            continue;
+          }
+          status = reference_params->NumericalCheck(numerical_params);
+          numerical_params->Delete();
+          if (status != OK) {
+            TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
+            continue;
+          }
+        }
+        // for warmup does user set max duration, max iters, or both?
+        // warmup is skipped by default, i.e. warmup_iter = 0
+        // warmup will be set to the non-zero value of max_warmup_duration
+        // or max_warmup_iter
+        // if both are non-zero, we take the smaller of the two.
+        double max_warmup_duration = ctx->GetMaxWarmupDurationMs();
+        int max_warmup_iter = ctx->GetMaxWarmupIterations();
+        int warmup_iter = 0; // default
+        if (max_warmup_duration > 0) {
+          // NOTE(review): double -> int conversion; an approx_duration of 0 (or
+          // one small enough to push the quotient past INT_MAX) is not guarded.
+          int duration_iters = max_warmup_duration / approx_duration;
+          if (max_warmup_iter > 0) {
+            warmup_iter = std::min(max_warmup_iter, duration_iters);
+          }
+          else {
+            warmup_iter = duration_iters;
+          }
+        }
+        else if (max_warmup_iter > 0) {
+          warmup_iter = max_warmup_iter;
+        }
+        // for tuning does user set max duration, max iters, or both?
+        // same selection rule as for warmup, but starting from 100 iterations
+        double max_tuning_duration = ctx->GetMaxTuningDurationMs();
+        int max_tuning_iter = ctx->GetMaxTuningIterations();
+        int tuning_iter = 100; // default
+        if (max_tuning_duration > 0) {
+          // NOTE(review): same unguarded double -> int conversion as above.
+          int duration_iters = max_tuning_duration / approx_duration;
+          if (max_tuning_iter > 0) {
+            tuning_iter = std::min(max_tuning_iter, duration_iters);
+          }
+          else {
+            tuning_iter = duration_iters;
+          }
+        }
+        else if (max_tuning_iter > 0) {
+          tuning_iter = max_tuning_iter;
+        }
+        // tuning must run at least 1 iteration
+        tuning_iter = std::max(1, tuning_iter);
+        // do the full warmup followed by tuning
+        // warmup_ms / tuning_ms are estimates used only in the log line below
+        double warmup_ms = warmup_iter * approx_duration;
+        double tuning_ms = tuning_iter * approx_duration;
+        TUNABLE_LOG3("├──tuning using "
+            "warmup iters ", warmup_iter, " [", warmup_ms, " ms] "
+            "and tuning iters ", tuning_iter, " [", tuning_ms, " ms] ",
+            "instance id=", i, ", ", op_sig, "(", params_sig, ") ", op_names_[i]);
+        TUNABLE_LOG3("├──offset at ", offset);
+        WarmUp(candidate, reusable_params, warmup_iter, offset);
+        s = ProfileStats(candidate, reusable_params, tuning_iter, offset);
+        auto s_stddev = s.stddev();
+        // Assume normal distribution.
+        // Solution with smallest mean + 2*sigma will be a better solution?
+        // if ((s._mean + 2*s_stddev) < (min_duration_ms + 2*min_stddev_ms)) {
+        // Currently only the mean decides; the stddev is logged but not compared.
+        if (s._mean < min_duration_ms) {
+          TUNABLE_LOG3("├──found better instance id=", i, ". " , s._mean, "ms. ", op_names_[i],
+                " min ", s._min,
+                " max ", s._max,
+                " mean ", s._mean,
+                " std ", s_stddev);
+          min_duration_ms = s._mean;
+          id_name = op_names_[i];
+          std::string current_soln = std::to_string(s._mean) + " " + op_names_[i];
+          top_solns.push(current_soln);
+        }
+        else {
+          TUNABLE_LOG3("├──found slower instance id=", i, ". " , s._mean, "ms. ", op_names_[i],
+                " min ", s._min,
+                " max ", s._max,
+                " mean ", s._mean,
+                " std ", s_stddev);
+        }
+      }
+      // Release the parameter copies made above.
+      for (size_t i = 0; i < reusable_params.size(); i++) {
+        reusable_params[i]->Delete();
+      }
+      if (reference_params) {
+        reference_params->Delete();
+      }
+      TUNABLE_LOG2("└──found fastest for ", op_sig, '(', params_sig, ") ", id_name);
+      TUNABLE_LOG2("└──top five solutions for ", op_sig, '(', params_sig, ") ");
+      // Newest (i.e. fastest) entry first.
+      for (auto it = top_solns.rbegin(); it != top_solns.rend(); ++it) {
+        TUNABLE_LOG2("   ", *it);
+      }
+      return ResultEntry(id_name, min_duration_ms, blas_sig);
+    }
+  private:
+    // Builds the signature from the dynamic type of *this.
+    // On non-Windows builds the mangled typeid name is demangled; if demangling
+    // fails the mangled name is returned unchanged. On Windows the typeid name
+    // is returned as is.
+    std::string CreateSignature() {
+#ifndef _WIN32
+      const auto* name = typeid(*this).name();
+      int status = 0;
+      // __cxa_demangle requires its output buffer to be malloc-allocated (it
+      // may realloc it), so let it allocate by passing nullptr and release the
+      // result with free().
+      std::unique_ptr<char, decltype(&std::free)> demangled(
+          abi::__cxa_demangle(name, nullptr, nullptr, &status), &std::free);
+      if (status != 0 || !demangled) {
+        return name;
+      }
+      return demangled.get();
+#else
+      return typeid(*this).name();
+#endif
+    }
+    mutable c10::once_flag signature_init_once_;
+    std::string signature_;
+    std::unordered_map<std::string, std::unique_ptr<Callable<ParamsT>>> ops_;
+    std::vector<std::string> op_names_;
+};
+// Abstract base for parameter types: implementations must provide the two
+// signature strings. TunableOp calls Signature() and BLASSignature() on its
+// params when looking up, tuning and recording results.
+struct OpParams {
+  virtual ~OpParams() = default;
+  virtual std::string Signature() const = 0;
+  virtual std::string BLASSignature() const = 0;
+};
+} // namespace at::cuda::tunable
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/ADInterpreters.h ADDED Viewed

	@@ -0,0 +1,43 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/functorch/Interpreter.h>
+namespace at::functorch {
+// These are the interpreters for our AD transforms
+// (grad, vjp and jvp).
+// See NOTE: [functorch interpreter stack] for more details.
+// Non-owning view over an Interpreter whose key is TransformType::Grad; the
+// constructor asserts the key. key() and level() forward to the wrapped
+// interpreter.
+struct TORCH_API GradInterpreterPtr {
+  explicit GradInterpreterPtr(const Interpreter* base)
+      : base_(base) {
+    TORCH_INTERNAL_ASSERT(base->key() == TransformType::Grad);
+  }
+  TransformType key() const {
+    return base_->key();
+  }
+  int64_t level() const {
+    return base_->level();
+  }
+  void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+  void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case);
+  // Reads prevGradMode_ from the GradInterpreterMeta alternative of meta().
+  bool prevGradMode() const {
+    return std::get<GradInterpreterMeta>(base_->meta()).prevGradMode_;
+  }
+  Tensor lift(const Tensor& tensor) const;
+ private:
+  const Interpreter* base_;
+};
+// Non-owning view over an Interpreter whose key is TransformType::Jvp; the
+// constructor asserts the key. key() and level() forward to the wrapped
+// interpreter.
+struct TORCH_API JvpInterpreterPtr {
+  explicit JvpInterpreterPtr(const Interpreter* base)
+      : base_(base) {
+    TORCH_INTERNAL_ASSERT(base->key() == TransformType::Jvp);
+  }
+  TransformType key() const {
+    return base_->key();
+  }
+  int64_t level() const {
+    return base_->level();
+  }
+  void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+  void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case);
+  // Reads prevFwdGradMode_ from the JvpInterpreterMeta alternative of meta().
+  bool prevFwdGradMode() const {
+    return std::get<JvpInterpreterMeta>(base_->meta()).prevFwdGradMode_;
+  }
+  Tensor lift(const Tensor& tensor) const;
+ private:
+  const Interpreter* base_;
+};
+} // namespace at::functorch
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchRulesHelper.h ADDED Viewed

	@@ -0,0 +1,486 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include <c10/util/TypeList.h>
+#include <ATen/ATen.h>
+#include <ATen/Operators.h>
+#include <ATen/functorch/DynamicLayer.h>
+#include <ATen/functorch/TensorWrapper.h>
+#include <ATen/functorch/BatchingMetaprogramming.h>
+#include <ATen/functorch/LegacyVmapTransforms.h>
+#include <ATen/functorch/BatchedFallback.h>
+#include <ATen/functorch/PlumbingHelper.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <ATen/VmapGeneratedPlumbing.h>
+#include <utility>
+// This file contains helper functions for batching rules.
+namespace at::functorch {
+TORCH_API Tensor reshape_dim_into(int64_t src, int64_t dst, const Tensor& x);
+TORCH_API Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x);
+TORCH_API Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, const Tensor& x);
+Tensor moveBatchDimToFront(Tensor tensor, std::optional<int64_t> maybe_batch_dim);
+int64_t rankWithoutBatchDim(const Tensor& tensor, std::optional<int64_t> maybe_batch_dim);
+int64_t numelWithoutBatchDim(const Tensor& tensor, std::optional<int64_t> maybe_batch_dim);
+std::optional<int64_t> valIfNonempty(std::optional<int64_t> maybe_empty, int64_t new_val);
+int64_t getPhysicalDim(const Tensor& tensor, bool has_batch_dim, int64_t logical_dim);
+VmapDimVector getPhysicalDims(const Tensor& tensor, bool has_batch_dim, IntArrayRef logical_dims);
+void vmapIncompatibleInplaceError(const char* schema_name);
+Tensor maybePadToLogicalRank(const Tensor& tensor, std::optional<int64_t> has_bdim, int64_t logical_rank);
+void check_randomness(RandomnessType randomness);
+void check_randomness(RandomnessType randomness, bool any_tensor_bdim);
+// Returns `tensor` unchanged when `has_bdim` is true. Otherwise expands it to a
+// shape made of `batch_size` followed by the tensor's own symbolic sizes.
+inline Tensor ensure_has_bdim(const Tensor& tensor, bool has_bdim, c10::SymInt batch_size) {
+  if (has_bdim) {
+    return tensor;
+  }
+  const auto sizes = tensor.sym_sizes();
+  SymDimVector expanded_shape;
+  // Room for every existing size plus the leading batch dimension added below.
+  expanded_shape.reserve(sizes.size() + 1);
+  expanded_shape.emplace_back(std::move(batch_size));
+  expanded_shape.insert(expanded_shape.end(), sizes.begin(), sizes.end());
+  return tensor.expand_symint(expanded_shape);
+}
+// Registration macros. Each one expands to an `m.impl(...)` statement, so a
+// variable named `m` must be in scope at the point of use.
+// VMAP_SUPPORT(op, batch_rule): registers "op" with `op_generated_plumbing`
+// instantiated on the type and address of `batch_rule`.
+#define VMAP_SUPPORT(op, batch_rule) \
+  m.impl(#op, op ## _generated_plumbing<decltype(&batch_rule), &batch_rule>);
+// Same as VMAP_SUPPORT, but for the overload "op.overload".
+#define VMAP_SUPPORT2(op, overload, batch_rule) \
+  m.impl(#op "." #overload, op ## _ ## overload ## _generated_plumbing<decltype(&batch_rule), &batch_rule>);
+// OP_DECOMPOSE / OP_DECOMPOSE2: register `native::op`, cast to the function
+// type of ATEN_FN(op) / ATEN_FN2(op, overload), as the implementation.
+#define OP_DECOMPOSE(op)  m.impl(#op, static_cast<decltype(&ATEN_FN(op))>(native::op));
+#define OP_DECOMPOSE2(op, overload)  m.impl(#op"."#overload, static_cast<decltype(&ATEN_FN2(op, overload))>(native::op));
+// DO NOT USE ME DIRECTLY! Use BASIC_UNARY_BATCH_RULE to save yourself some pain
+// The primary template is only declared; the specialization below matches a
+// function `Func` whose parameter typelist is <A, T...>.
+template <typename A, A a, typename C>
+struct BasicUnaryBatchRuleHelper;
+template <typename F, F Func, typename A, typename... T>
+struct BasicUnaryBatchRuleHelper<F, Func, c10::guts::typelist::typelist<A, T...>> {
+  // Calls Func on the tensor as-is and reports the incoming batch dim
+  // unchanged alongside the result.
+  static std::tuple<Tensor, std::optional<int64_t>> apply(
+      const Tensor& tensor,
+      std::optional<int64_t> batch_dim,
+      T... extra_args) {
+    auto output = Func(tensor, std::forward<T>(extra_args)...);
+    return std::make_tuple(std::move(output), batch_dim);
+  }
+};
+// USAGE: BASIC_UNARY_BATCH_RULE(at::sin)
+// INCORRECT USAGE: BASIC_UNARY_BATCH_RULE(&at::sin)
+// It is important that this macro is not passed a function pointer!!
+// (The macro itself writes `&fn` and `decltype(fn)`, so `fn` must name the
+// function rather than its address.)
+#define BASIC_UNARY_BATCH_RULE(fn) SINGLE_ARG(\
+    BasicUnaryBatchRuleHelper<\
+      decltype(&fn),\
+      &fn,\
+      c10::guts::function_traits<decltype(fn)>::parameter_types>::apply)
+// Registers `op` through VMAP_SUPPORT with the basic unary batch rule built
+// from ATEN_FN(op).
+#define UNARY_POINTWISE(op) \
+  VMAP_SUPPORT(op, BASIC_UNARY_BATCH_RULE(ATEN_FN(op)));
+// The primary template is only declared; the specialization below matches a
+// function `Func` whose parameter typelist is <A, T...>.
+template <typename A, A a, typename C>
+struct VariadicBdimsBatchRuleHelper;
+template <typename F, F Func, typename A, typename... T>
+struct VariadicBdimsBatchRuleHelper<F, Func, c10::guts::typelist::typelist<A, T...>> {
+  // Moves the batch dim to the front, calls Func on the result, and reports
+  // the output batch dim as 0.
+  static std::tuple<Tensor, std::optional<int64_t>> apply(
+      const Tensor& tensor,
+      std::optional<int64_t> batch_dim,
+      T... extra_args) {
+    Tensor batch_first = moveBatchDimToFront(tensor, batch_dim);
+    std::optional<int64_t> out_bdim = 0;
+    return std::make_tuple(Func(batch_first, std::forward<T>(extra_args)...), out_bdim);
+  }
+};
+// USAGE: VARIADIC_BDIMS_BATCH_RULE(at::cholesky_inverse)
+// INCORRECT USAGE: VARIADIC_BDIMS_BATCH_RULE(&at::cholesky_inverse)
+// It is important that this macro is not passed a function pointer!!
+// (The macro itself writes `&fn` and `decltype(fn)`, so `fn` must name the
+// function rather than its address.)
+#define VARIADIC_BDIMS_BATCH_RULE(fn) SINGLE_ARG(\
+    VariadicBdimsBatchRuleHelper<\
+      decltype(&fn),\
+      &fn,\
+      c10::guts::function_traits<decltype(fn)>::parameter_types>::apply)
+// Register `op` (or "op.overload") through VMAP_SUPPORT / VMAP_SUPPORT2 with
+// the variadic-bdims batch rule built from the corresponding ATEN_FN.
+#define VARIADIC_BDIMS(op) \
+  VMAP_SUPPORT(op, VARIADIC_BDIMS_BATCH_RULE(ATEN_FN(op)));
+#define VARIADIC_BDIMS2(op, overload) \
+  VMAP_SUPPORT2(op, overload, VARIADIC_BDIMS_BATCH_RULE(ATEN_FN2(op, overload)));
+// Boxed batching rule that hands all tensor arguments to `Func` for
+// preprocessing and then re-dispatches the operator.
+//   1. If no argument participates in the current level, call `op` directly.
+//   2. Pop the arguments, unwrap every tensor at the current level and collect
+//      (value, bdim) pairs plus their argument positions.
+//   3. Let `Func` rewrite the collected tensors in place.
+//   4. Push the arguments back (tensors replaced by their rewritten values),
+//      call `op`, and wrap every returned tensor as batched at dim 0.
+// Non-tensor returns are not supported and trigger an internal assert.
+template<class F, F Func>
+void boxed_tensor_inputs_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+  const auto& schema = op.schema();
+  const auto num_returns = schema.returns().size();
+  const auto num_arguments = schema.arguments().size();
+  c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
+  auto maybe_layer = maybeCurrentDynamicLayer();
+  vmap_check_escaped(maybe_layer, "boxed_tensor_inputs_batch_rule");
+  int64_t cur_level = maybe_layer->layerId();
+  auto orig_arguments = torch::jit::last(*stack, num_arguments);
+  const bool any_participates = std::any_of(
+      orig_arguments.begin(), orig_arguments.end(), ivalueParticipatesInCurrentLevel);
+  if (!any_participates) {
+    op.callBoxed(stack);
+    return;
+  }
+  auto arguments = torch::jit::pop(*stack, num_arguments);
+  std::vector<std::pair<Tensor, std::optional<int64_t>>> tensor_inputs;
+  std::vector<int64_t> tensor_pos;
+  for (const auto idx : c10::irange(0, num_arguments)) {
+    const auto& ivalue = arguments[idx];
+    if (!ivalue.isTensor()) {
+      continue;
+    }
+    auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(ivalue.toTensor(), cur_level);
+    tensor_inputs.emplace_back(std::move(tensor_value), tensor_bdim);
+    tensor_pos.push_back(static_cast<int64_t>(idx));
+  }
+  Func(tensor_inputs);
+  size_t tensor_idx = 0;
+  TORCH_INTERNAL_ASSERT(!tensor_pos.empty());
+  for (const auto arg_idx : c10::irange(0, num_arguments)) {
+    // tensor_pos is ascending, so the next tensor slot is always the one at
+    // the current cursor.
+    const bool is_tensor_slot =
+        tensor_idx < tensor_pos.size() &&
+        static_cast<int64_t>(arg_idx) == tensor_pos[tensor_idx];
+    if (is_tensor_slot) {
+      TORCH_INTERNAL_ASSERT(tensor_idx < tensor_inputs.size());
+      torch::jit::push(stack, tensor_inputs[tensor_idx].first);
+      ++tensor_idx;
+    } else {
+      torch::jit::push(stack, arguments[arg_idx]);
+    }
+  }
+  op.callBoxed(stack);
+  const auto returns = torch::jit::pop(*stack, num_returns);
+  for (const auto& ret : returns) {
+    if (!ret.isTensor()) {
+      TORCH_INTERNAL_ASSERT(false, "This boxed batching rule does not currently support ops that return non-tensor values");
+    }
+    torch::jit::push(stack, makeBatched(ret.toTensor(), 0, cur_level));
+  }
+}
+// Preprocessing hook for pointwise ops: every tensor gets its batch dim moved
+// to the front and is then padded (via maybePadToLogicalRank) to the largest
+// logical rank found among the inputs. Rewrites `tensor_inputs` in place.
+inline void handle_pointwise_ops(std::vector<std::pair<Tensor, std::optional<int64_t>>> &tensor_inputs) {
+  // Pass 1: the largest rank among the inputs, not counting the batch dim.
+  int64_t out_logical_rank = 0;
+  for (const auto& [value, bdim] : tensor_inputs) {
+    out_logical_rank = std::max(out_logical_rank, rankWithoutBatchDim(value, bdim));
+  }
+  // Pass 2: normalise each tensor against that rank.
+  for (auto& [value, bdim] : tensor_inputs) {
+    value = moveBatchDimToFront(value, bdim);
+    value = maybePadToLogicalRank(value, bdim, out_logical_rank);
+  }
+}
+// Register "op" (or "op.overload") with the boxed tensor-inputs batching rule
+// specialised on handle_pointwise_ops. The macro arguments only affect the
+// registered name; the kernel is the same in both macros. Requires `m` in scope.
+#define POINTWISE_BOXED(op) \
+  m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_tensor_inputs_batch_rule<decltype(&handle_pointwise_ops), &handle_pointwise_ops>>());
+#define POINTWISE_BOXED2(op, overload) \
+  m.impl(#op "." #overload, torch::CppFunction::makeFromBoxedFunction<boxed_tensor_inputs_batch_rule<decltype(&handle_pointwise_ops), &handle_pointwise_ops>>());
+// Preprocessing hook that replaces each tensor with the result of
+// moveBatchDimToFront; the stored bdim entries are left untouched.
+inline void handle_variadic_bdims(std::vector<std::pair<Tensor, std::optional<int64_t>>> &tensor_inputs) {
+  for (auto& [value, bdim] : tensor_inputs) {
+    value = moveBatchDimToFront(value, bdim);
+  }
+}
+// Registers "op" with the boxed tensor-inputs batching rule specialised on
+// handle_variadic_bdims. Requires `m` in scope.
+#define VARIADIC_BDIMS_BOXED(op) \
+  m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_tensor_inputs_batch_rule<decltype(&handle_variadic_bdims), &handle_variadic_bdims>>());
+// An unwrapped tensor together with its (optional) batch dim.
+using UnpackedBatchedTensor = std::tuple<Tensor, std::optional<int64_t>>;
+// Scans the last `num_args` entries of `stack`, unwraps every tensor at
+// `cur_level`, and appends the unwrapped (value, bdim) to `tensors` and the
+// argument index (relative to the first argument) to `tensors_pos`.
+// `*batch_size` receives the size of the batch dim, which must agree across
+// all batched tensors. Asserts if no tensor carries a batch dim.
+inline void find_and_unpack_tensors(
+    const torch::jit::Stack* stack,
+    int64_t num_args,
+    int64_t cur_level,
+    SmallVector<UnpackedBatchedTensor, 5>* tensors,
+    SmallVector<int64_t, 5>* tensors_pos,
+    int64_t* batch_size) {
+  // -1 means "no batched tensor seen yet".
+  int64_t computed_batch_size = -1;
+  const int64_t args_begin = static_cast<int64_t>(stack->size()) - num_args;
+  for (int64_t idx = 0; idx < num_args; ++idx) {
+    const auto& ivalue = (*stack)[args_begin + idx];
+    if (ivalue.isTensor()) {
+      auto unpacked = unwrapTensorAtLevel(ivalue.toTensor(), cur_level);
+      const auto& maybe_bdim = std::get<1>(unpacked);
+      if (maybe_bdim.has_value()) {
+        auto candidate_batch_size = std::get<0>(unpacked).size(*maybe_bdim);
+        if (computed_batch_size == -1) {
+          computed_batch_size = candidate_batch_size;
+        }
+        TORCH_INTERNAL_ASSERT(candidate_batch_size == computed_batch_size);
+      }
+      tensors->push_back(std::move(unpacked));
+      tensors_pos->push_back(idx);
+    }
+  }
+  TORCH_INTERNAL_ASSERT(computed_batch_size > -1);
+  *batch_size = computed_batch_size;
+}
+// Boxed batching rule that folds the batch dim of every tensor argument into
+// dim 0, calls `op`, and splits the batch dim back out of dim 0 of every
+// return. Arguments and returns are rewritten in place on `stack`.
+// Only tensor returns are supported (asserted below).
+inline void boxed_existing_bdim_all_batch_rule(
+    const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+  const auto& schema = op.schema();
+  const auto num_returns = schema.returns().size();
+  const auto num_arguments = static_cast<int64_t>(schema.arguments().size());
+  c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
+  const auto maybe_layer = maybeCurrentDynamicLayer();
+  vmap_check_escaped(maybe_layer, "boxed_existing_bdim_all_batch_rule");
+  const auto arguments = torch::jit::last(stack, num_arguments);
+  // Nothing participates in the current level: call the op unchanged.
+  if (std::none_of(arguments.begin(), arguments.end(), ivalueParticipatesInCurrentLevel)) {
+    op.callBoxed(stack);
+    return;
+  }
+  // Index of the first argument on the stack; the returns are read from this
+  // same position after the call.
+  int64_t args_begin = static_cast<int64_t>(stack->size()) - num_arguments;
+  SmallVector<UnpackedBatchedTensor, 5> tensor_inputs;
+  SmallVector<int64_t, 5> tensor_pos;
+  int64_t batch_size = 0;
+  // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
+  int64_t cur_level = maybe_layer->layerId();
+  find_and_unpack_tensors(
+      stack, num_arguments, cur_level,
+      &tensor_inputs, &tensor_pos, &batch_size);
+  // for each tensor, ensure it has a bdim and reshape it.
+  for (const auto tensor_idx : c10::irange(0, tensor_inputs.size())) {
+    const auto& [value, bdim] = tensor_inputs[tensor_idx];
+    auto value_ = ensure_has_bdim(value, bdim.has_value(), batch_size);
+    // Tensors without a bdim got one added at the front, hence value_or(0).
+    (*stack)[args_begin + tensor_pos[tensor_idx]] = reshape_dim_into(bdim.value_or(0), 0, value_);
+  }
+  op.callBoxed(stack);
+  // Re-wrap each return: split batch_size out of dim 0 and mark dim 0 as batched.
+  for (const auto idx : c10::irange(args_begin, args_begin + num_returns)) {
+    const auto& ret = (*stack)[idx];
+    TORCH_INTERNAL_ASSERT(ret.isTensor(),
+        "This boxed batching rule does not currently support ops that return non-tensor values");
+    (*stack)[idx] = makeBatched(reshape_dim_outof(0, batch_size, ret.toTensor()), 0, cur_level);
+  }
+}
+// Use when all tensor arguments accept one (normal) batch dim.
+// This batching rule expands the batch dim on all Tensors, reshapes it into
+// dim 0, calls the op, and then reshapes the batch dim out of dim 0.
+// This is not the most efficient thing; if there are alternatives, please try
+// to use them. Use this only as a last resort.
+// Expands to an `m.impl(...)` statement, so `m` must be in scope.
+#define EXISTING_BDIM_ALL_BOXED(op) \
+  m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_existing_bdim_all_batch_rule>());
+// Boxed batching rule for ops whose tensor arguments all share a rank and
+// accept an optional batch dim of their own.
+//   feature_rank        - logical rank (excluding the vmap batch dim) of a
+//                         tensor in the "no batch dim" case.
+//   contig_tensor_index - index into the collected tensor inputs of a tensor
+//                         to make contiguous; the default -1 matches none.
+// The first tensor decides the case for all tensors:
+//   * logical rank == feature_rank: the vmap batch dim is moved to the front.
+//   * otherwise the logical rank must be feature_rank + 1, and the vmap batch
+//     dim is folded into dim 0 and split back out of the returns.
+// Arguments and returns are rewritten in place on `stack`.
+template <int64_t feature_rank, int64_t contig_tensor_index=-1>
+inline void boxed_all_tensors_have_optional_bdim(
+    const c10::OperatorHandle& op, torch::jit::Stack* stack) {
+  const auto& schema = op.schema();
+  const auto num_returns = schema.returns().size();
+  const auto num_arguments = schema.arguments().size();
+  c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched);
+  auto maybe_layer = maybeCurrentDynamicLayer();
+  vmap_check_escaped(maybe_layer, "boxed_all_tensors_have_optional_bdim");
+  int64_t cur_level = maybe_layer->layerId();
+  const auto arguments = torch::jit::last(stack, num_arguments);
+  // Nothing participates in the current level: call the op unchanged.
+  if (std::none_of(arguments.begin(), arguments.end(), ivalueParticipatesInCurrentLevel)) {
+    op.callBoxed(stack);
+    return;
+  }
+  // Index of the first argument on the stack; returns are read from here too.
+  int64_t args_begin = static_cast<int64_t>(stack->size() - num_arguments);
+  SmallVector<UnpackedBatchedTensor, 5> tensor_inputs;
+  SmallVector<int64_t, 5> tensor_pos;
+  int64_t batch_size = 0;
+  find_and_unpack_tensors(
+      stack, static_cast<int64_t>(num_arguments), cur_level,
+      &tensor_inputs, &tensor_pos, &batch_size);
+  // Set by the first tensor and then enforced for every later tensor.
+  std::optional<bool> is_no_batch_dim_case;
+  for (const auto tensor_idx : c10::irange(0, tensor_inputs.size())) {
+    const auto& value = std::get<0>(tensor_inputs[tensor_idx]);
+    auto bdim = std::get<1>(tensor_inputs[tensor_idx]);
+    const auto logical_rank = rankWithoutBatchDim(value, bdim);
+    if (!is_no_batch_dim_case.has_value()) {
+      is_no_batch_dim_case = (logical_rank == feature_rank);
+    }
+    auto value_ = ensure_has_bdim(value, bdim.has_value(), batch_size);
+    // ensure_has_bdim adds a missing batch dim at the front.
+    if (!bdim.has_value()) {
+      bdim = 0;
+    }
+    if (*is_no_batch_dim_case) {
+      TORCH_INTERNAL_ASSERT(logical_rank == feature_rank);
+      value_ = moveBatchDimToFront(value_, bdim);
+      if (tensor_idx == contig_tensor_index) {
+        value_ = value_.contiguous();
+      }
+      (*stack)[args_begin + tensor_pos[tensor_idx]] = std::move(value_);
+      continue;
+    }
+    TORCH_INTERNAL_ASSERT(logical_rank == feature_rank + 1);
+    value_ = reshape_dim_into(*bdim, 0, value_);
+    if (tensor_idx == contig_tensor_index) {
+      value_ = value_.contiguous();
+    }
+    (*stack)[args_begin + tensor_pos[tensor_idx]] = std::move(value_);
+  }
+  op.callBoxed(stack);
+  // Re-wrap the returns; only the folded case has to split dim 0 again.
+  for (const auto idx : c10::irange(args_begin, args_begin + num_returns)) {
+    const auto& ret = (*stack)[idx];
+    TORCH_INTERNAL_ASSERT(ret.isTensor(),
+        "This boxed batching rule does not currently support ops that return non-tensor values");
+    if (*is_no_batch_dim_case) {
+      (*stack)[idx] = makeBatched(ret.toTensor(), 0, cur_level);
+    } else {
+      (*stack)[idx] = makeBatched(reshape_dim_outof(0, batch_size, ret.toTensor()), 0, cur_level);
+    }
+  }
+}
+// Useful for many NN operators.
+// The operator must satisfy the following:
+// - All arguments must accept an optional batch dim.
+// - All arguments must be the same rank
+// `feature_rank` is forwarded as the first template argument of
+// boxed_all_tensors_have_optional_bdim. Requires `m` in scope.
+#define ALL_TENSORS_HAVE_OPTIONAL_BDIM_BOXED(feature_rank, op) \
+  m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_all_tensors_have_optional_bdim<feature_rank>>());
+// Same as above, additionally forwarding `contig_tensor_index`: the tensor
+// input at that index is made contiguous before the op is called.
+#define ALL_TENSORS_HAVE_OPTIONAL_BDIM_BOXED_CONTIG1(feature_rank, op, contig_tensor_index) \
+  m.impl(#op, \
+         torch::CppFunction::makeFromBoxedFunction<\
+             boxed_all_tensors_have_optional_bdim<\
+                 feature_rank, \
+                 contig_tensor_index>\
+             >());
+// The primary template is only declared; the specialization below matches a
+// function `Func` whose parameter typelist is <A, T...>.
+template <typename A, A a, typename C>
+struct ExistingBdimBatchRuleHelper;
+template <typename F, F Func, typename A, typename... T>
+struct ExistingBdimBatchRuleHelper<F, Func, c10::guts::typelist::typelist<A, T...>> {
+  // Folds the batch dim of `self` into dim 0, calls Func, then splits the
+  // batch size back out of dim 0 of the result. The output batch dim is 0.
+  // `self_bdim` is dereferenced unconditionally, so it must hold a value.
+  static std::tuple<Tensor, std::optional<int64_t>> apply(
+      const Tensor& self,
+      std::optional<int64_t> self_bdim,
+      T... extra_args) {
+    auto folded = reshape_dim_into(*self_bdim, 0, self);
+    auto result = Func(folded, std::forward<T>(extra_args)...);
+    std::optional<int64_t> out_bdim = 0;
+    return std::make_tuple(
+        reshape_dim_outof_symint(0, self.sym_sizes()[*self_bdim], result),
+        out_bdim);
+  }
+};
+// USAGE: EXISTING_BDIM_BATCH_RULE(at::cholesky_inverse)
+// INCORRECT USAGE: EXISTING_BDIM_BATCH_RULE(&at::cholesky_inverse)
+// It is important that this macro is not passed a function pointer!!
+// (The macro itself writes `&fn` and `decltype(fn)`, so `fn` must name the
+// function rather than its address.)
+#define EXISTING_BDIM_BATCH_RULE(fn) SINGLE_ARG(\
+    ExistingBdimBatchRuleHelper<\
+      decltype(&fn),\
+      &fn,\
+      c10::guts::function_traits<decltype(fn)>::parameter_types>::apply)
+// Register `op` (or "op.overload") through VMAP_SUPPORT / VMAP_SUPPORT2 with
+// the existing-bdim batch rule built from the corresponding ATEN_FN.
+#define EXISTING_BDIM(op) \
+  VMAP_SUPPORT(op, EXISTING_BDIM_BATCH_RULE(ATEN_FN(op)));
+#define EXISTING_BDIM2(op, overload) \
+  VMAP_SUPPORT2(op, overload, EXISTING_BDIM_BATCH_RULE(ATEN_FN2(op, overload)));
+// Pointer-to-member access: INVOKE(obj, ptr)(args...) calls the member
+// function `ptr` on `obj`.
+#define INVOKE(object,ptrToMember)  ((object).*(ptrToMember))
+// Batch rule for unary in-place methods: calls the member function `Method`
+// on `self` with the extra arguments and returns `self`. The batch dim
+// argument is ignored.
+template <typename F, F Method, typename... ExtraArgs>
+Tensor& unary_inplace_batch_rule(Tensor& self, std::optional<int64_t> /*unused*/, ExtraArgs... extra_args) {
+  (self.*Method)(std::forward<ExtraArgs>(extra_args)...);
+  return self;
+}
+// Returns the size of the batch dim of the first of (a, b, c, d) that has
+// one, checked in that order. Asserts if none of them has a batch dim.
+inline int64_t get_bdim_size4(
+    const Tensor& a_value, std::optional<int64_t> a_bdim,
+    const Tensor& b_value, std::optional<int64_t> b_bdim,
+    const Tensor& c_value, std::optional<int64_t> c_bdim,
+    const Tensor& d_value, std::optional<int64_t> d_bdim) {
+  if (a_bdim.has_value()) {
+    return a_value.size(*a_bdim);
+  }
+  if (b_bdim.has_value()) {
+    return b_value.size(*b_bdim);
+  }
+  if (c_bdim.has_value()) {
+    return c_value.size(*c_bdim);
+  }
+  if (d_bdim.has_value()) {
+    return d_value.size(*d_bdim);
+  }
+  TORCH_INTERNAL_ASSERT(false);
+}
+// Returns the size of the batch dim of the first of (a, b, c) that has one,
+// checked in that order. Asserts if none of them has a batch dim.
+inline int64_t get_bdim_size3(
+    const Tensor& a_value, std::optional<int64_t> a_bdim,
+    const Tensor& b_value, std::optional<int64_t> b_bdim,
+    const Tensor& c_value, std::optional<int64_t> c_bdim) {
+  if (a_bdim.has_value()) {
+    return a_value.size(*a_bdim);
+  }
+  if (b_bdim.has_value()) {
+    return b_value.size(*b_bdim);
+  }
+  if (c_bdim.has_value()) {
+    return c_value.size(*c_bdim);
+  }
+  TORCH_INTERNAL_ASSERT(false);
+}
+// Returns the size of the batch dim of `a` if it has one, otherwise of `b`.
+// Asserts if neither has a batch dim.
+inline int64_t get_bdim_size2(
+    const Tensor& a_value, std::optional<int64_t> a_bdim,
+    const Tensor& b_value, std::optional<int64_t> b_bdim) {
+  if (a_bdim.has_value()) {
+    return a_value.size(*a_bdim);
+  }
+  if (b_bdim.has_value()) {
+    return b_value.size(*b_bdim);
+  }
+  TORCH_INTERNAL_ASSERT(false);
+}
+// SymInt variant of get_bdim_size2: returns sym_size of the batch dim of `a`
+// if it has one, otherwise of `b`. Asserts if neither has a batch dim.
+inline c10::SymInt get_bdim_size2_symint(
+    const Tensor& a_value, std::optional<int64_t> a_bdim,
+    const Tensor& b_value, std::optional<int64_t> b_bdim) {
+  if (a_bdim.has_value()) {
+    return a_value.sym_size(*a_bdim);
+  }
+  if (b_bdim.has_value()) {
+    return b_value.sym_size(*b_bdim);
+  }
+  TORCH_INTERNAL_ASSERT(false);
+}
+// Builds the half-open integer sequence [start, stop):
+// [start, start + 1, ..., stop - 1]. Asserts that stop >= start.
+inline VmapDimVector range(int64_t start, int64_t stop) {
+  TORCH_INTERNAL_ASSERT(stop >= start);
+  VmapDimVector dims;
+  dims.reserve(stop - start);
+  int64_t next = start;
+  while (next < stop) {
+    dims.emplace_back(next);
+    ++next;
+  }
+  return dims;
+}
+// Declaration only; defined elsewhere. Takes two (tensor, batch dim) pairs and
+// returns a pair of tensors.
+// NOTE(review): presumably aligns both operands for a binary pointwise op and
+// applies type promotion when `do_type_promotion` is true -- confirm against
+// the definition.
+std::tuple<Tensor, Tensor> _binary_pointwise_helper(
+    const Tensor& tensor, std::optional<int64_t> tensor_batch_dim, const Tensor& other, std::optional<int64_t> other_batch_dim,
+    bool do_type_promotion=true);
+} // namespace at::functorch
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedFallback.h ADDED Viewed

	@@ -0,0 +1,86 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include <ATen/ATen.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <torch/library.h>
+namespace at::functorch {
+// This file contains code for the vmap fallback (also known as the
+// BatchedTensor fallback or the Batched fallback). This code runs
+// when an operation doesn't have a batching rule implemented.
+// If an operator doesn't have a batching rule implemented then we fallback
+// to this implementation. The fallback doesn't work on out= variants or
+// view operations; that is, it works for out-of-place operations and
+// in-place non-view operations.
+//
+// For out-of-place operations, the fallback effectively takes all of the
+// BatchedTensors in `stack`, slices them, and runs `op` on all of the
+// corresponding slices to produce slices of the outputs. The output slices
+// then get `torch.stack`ed to create the
+// final returns.
+//
+// The performance of the fallback is not very good because it introduces an
+// extra copy from stacking the sliced outputs. Because of this, we prefer to
+// write batching rules for operators whenever possible.
+// Boxed fallback kernels (declarations only; defined elsewhere). All three
+// share the boxed-kernel signature (operator handle + stack).
+// NOTE(review): batchedTensorForLoopFallback is presumably the for-loop
+// fallback described above; the NestedTensor and error variants are not
+// described in this header -- confirm against the definitions.
+void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+void batchedNestedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+void vmapErrorFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+// The vmap fallback emits a warning by default, but it may be disabled if
+// the user finds it to be too annoying.
+TORCH_API bool isVmapFallbackWarningEnabled();
+TORCH_API void setVmapFallbackWarningEnabled(bool enabled);
+// Used for testing. The vmap fallback is enabled by default. When it is disabled,
+// it raises an error.
+TORCH_API bool isVmapFallbackEnabled();
+TORCH_API void setVmapFallbackEnabled(bool enabled);
+// Single-return case: converts the first IValue in `buffer` to A.
+template <typename A> A vector_to_result(const std::vector<IValue>& buffer) {
+  const IValue& only = buffer[0];
+  return only.to<A>();
+}
+// Two-return case: converts the first two IValues in `buffer` to A and B.
+template <typename A, typename B> std::tuple<A, B> vector_to_result(const std::vector<IValue>& buffer) {
+  auto first = buffer[0].to<A>();
+  auto second = buffer[1].to<B>();
+  return std::make_tuple(std::move(first), std::move(second));
+}
+// Three-return case: converts the first three IValues in `buffer` to A, B
+// and C respectively.
+template <typename A, typename B, typename C> std::tuple<A, B, C> vector_to_result(const std::vector<IValue>& buffer) {
+  // The third element must be converted to C; converting it to B only worked
+  // when B and C happened to be the same (or convertible) types.
+  return std::make_tuple(buffer[0].to<A>(), buffer[1].to<B>(), buffer[2].to<C>());
+}
+// slow_fallback is a way to call the vmap fallback inside some boxed kernel.
+// There is probably some better way to metaprogram this.
+// Single-return overload: copies `args` onto a fresh stack, runs the for-loop
+// fallback on it, and converts the resulting stack to Ret.
+template <typename Ret>
+Ret slow_fallback(const c10::OperatorHandle& op, ArrayRef<IValue> args) {
+  std::vector<IValue> boxed_stack;
+  boxed_stack.assign(args.begin(), args.end());
+  batchedTensorForLoopFallback(op, &boxed_stack);
+  return vector_to_result<Ret>(boxed_stack);
+}
+// Two-return overload: copies `args` onto a fresh stack, runs the for-loop
+// fallback on it, and converts the resulting stack to (A, B).
+template <typename A, typename B>
+std::tuple<A, B> slow_fallback(const c10::OperatorHandle& op, ArrayRef<IValue> args) {
+  std::vector<IValue> boxed_stack;
+  boxed_stack.assign(args.begin(), args.end());
+  batchedTensorForLoopFallback(op, &boxed_stack);
+  return vector_to_result<A, B>(boxed_stack);
+}
+// Three-return overload: copies `args` onto a fresh stack, runs the for-loop
+// fallback on it, and converts the resulting stack to (A, B, C).
+template <typename A, typename B, typename C>
+std::tuple<A, B, C> slow_fallback(const c10::OperatorHandle& op, ArrayRef<IValue> args) {
+  std::vector<IValue> boxed_stack;
+  boxed_stack.assign(args.begin(), args.end());
+  batchedTensorForLoopFallback(op, &boxed_stack);
+  return vector_to_result<A, B, C>(boxed_stack);
+}
+} // namespace at::functorch
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h ADDED Viewed

	@@ -0,0 +1,181 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include <bitset>
+#include <ATen/ArrayRef.h>
+#include <ATen/SmallVector.h>
+#include <ATen/Tensor.h>
+namespace at::functorch {
+// Makes `Tensor` usable unqualified inside at::functorch.
+using Tensor = at::Tensor;
+// We assume this in a few other places in the codebase,
+// but there isn't a centralized definition.
+// Also the width of the bitset returned by createBatchDimBitset.
+constexpr int64_t kVmapMaxTensorDims = 64;
+// The valid vmap levels range from [0, 64). This effectively means that we
+// support a maximum of 64 nested vmaps.
+// Also the width of the bitset returned by createVmapLevelsBitset.
+constexpr int64_t kVmapNumLevels = 64;
+// Store this number of elements of BatchDims on the stack. Most people will
+// probably use <= 5 nested vmaps, but adjust this number as necessary.
+constexpr int64_t kBatchDimsStackSize = 5;
+// A BatchedTensorImpl holds an underlying Tensor and a single batch dim
+// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a
+// BatchedTensorImpl.
+//
+// The batch dimensions are treated as being "private"; they are not user-visible.
+// For example, in the following Tensor,
+//    bt = BatchedTensorImpl(ones(2, 3, 5, 7), lvl=1, dim=0)
+// dimension 0 is batch dimension.
+//
+// bt.sizes() returns (3, 5, 7); bt.sum(0) performs a reduction over the (public)
+// dim 0, which is equivalent to dim 1 in the underlying ones(2, 3, 5, 7) tensor.
+// TensorImpl subclass that wraps a Tensor (`value_`) together with one batch
+// dimension index (`bdim_`) and the vmap level it belongs to (`level_`).
+struct TORCH_API BatchedTensorImpl : public c10::TensorImpl {
+  explicit BatchedTensorImpl(at::DispatchKeySet key_set, Tensor value, int64_t dim, int64_t level);
+  // Returns batch dimension of this tensor
+  int64_t bdim() const { return bdim_; }
+  // Returns the vmap level of this tensor
+  int64_t level() const { return level_; }
+  // BatchedTensorImpl wraps a Tensor
+  const Tensor& value() const { return value_; }
+  // Given a public dimension index, return the dimension index in the underlying
+  // value() tensor.
+  // For example, if we have
+  //    bt = BatchedTensorImpl(ones(2, 3, 5, 7), lvl=1, dim=0)
+  // bt.actualDim(0) -> 1
+  // bt.actualDim(1) -> 2
+  // bt.actualDim(2) -> 3
+  // bt.actualDim(3) -> Error
+  int64_t actualDim(int64_t dim, bool wrap_dim = true) const;
+  // Size queries are overridden (the *_custom hooks of TensorImpl).
+  IntArrayRef sizes_custom() const override;
+  SymIntArrayRef sym_sizes_custom() const override;
+  int64_t size_custom(int64_t d) const override;
+  c10::SymInt sym_size_custom(int64_t d) const override;
+  // We have to override this because we opted into CustomStrides
+  IntArrayRef strides_custom() const override;
+  SymIntArrayRef sym_strides_custom() const override;
+  // Override a bunch of methods inherited from TensorImpl to return error messages.
+  c10::SymBool sym_is_contiguous_custom(at::MemoryFormat memory_format) const override;
+  void set_size(int64_t dim, int64_t new_size) override;
+  void set_stride(int64_t dim, int64_t new_stride) override;
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+    const c10::VariableVersion& version_counter,
+    bool allow_tensor_metadata_change) const override;
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const override;
+  void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) override;
+#ifdef DEBUG
+  bool has_storage() const override;
+#endif
+  // Must be called after unsafe_set_bdim (see the note there).
+  void refreshTensorMetadata();
+  // Used in torchdim. torchdim uses non-lexical BatchedTensor; the way it
+  // accomplishes this is a hack where it is able to modify the levels of
+  // BatchedTensor to match the level of the current vmap transform.
+  void _unsafe_set_level(int64_t level) {
+    level_ = level;
+  }
+  // Used in batching rule for in-place view operations that can change
+  // the index of the bdim (think squeeze_, unsqueeze_)
+  void unsafe_set_bdim(int64_t bdim) {
+    // NB: you MUST call refreshTensorMetadata after doing this.
+    bdim_ = bdim;
+  }
+ private:
+  // see NOTE: [BatchedTensorImpl levels invariant]
+  void checkInvariants() const;
+  const char* tensorimpl_type_name() const override;
+  // The wrapped tensor, returned by value().
+  Tensor value_;
+  // The vmap level, returned by level().
+  int64_t level_;
+  // Index of the batch dimension in value_, returned by bdim().
+  int64_t bdim_;
+};
+// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a
+// BatchedTensorImpl.
+// True when the tensor's key set contains FuncTorchBatched or
+// BatchedNestedTensor.
+inline bool isBatchedTensor(const Tensor& tensor) {
+  const auto keys = tensor.unsafeGetTensorImpl()->key_set();
+  if (keys.has(DispatchKey::FuncTorchBatched)) {
+    return true;
+  }
+  return keys.has(DispatchKey::BatchedNestedTensor);
+}
+// It is unsafe to call this on a Tensor that is not backed by a
+// BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible.
+// Performs an unchecked static_cast of the tensor's TensorImpl pointer.
+inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) {
+  auto* impl = tensor.unsafeGetTensorImpl();
+  return static_cast<BatchedTensorImpl*>(impl);
+}
+// Checked counterpart of unsafeGetBatchedImpl: returns nullptr when
+// isBatchedTensor(tensor) is false.
+inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) {
+  return isBatchedTensor(tensor) ? unsafeGetBatchedImpl(tensor) : nullptr;
+}
+// Returns a bitset. If bit i is set, then that means dim i is a batchdim.
+// Only the bit for `dim` is set in the result.
+inline std::bitset<kVmapMaxTensorDims> createBatchDimBitset(int64_t dim) {
+  return std::bitset<kVmapMaxTensorDims>{}.set(dim);
+}
+// Creates a bitset for the given level
+// Only the bit for `level` is set in the result.
+inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(int64_t level) {
+  return std::bitset<kVmapNumLevels>{}.set(level);
+}
+// Use this to construct a BatchedTensor from a regular Tensor
+// (declaration only; `dim` is the batch dim index and `level` the vmap level,
+// matching the BatchedTensorImpl constructor parameters).
+TORCH_API Tensor makeBatched(Tensor tensor, int64_t dim, int64_t level);
+// Adds a batch dim to `tensor`, returning a BatchedTensor
+// NOTE(review): how this differs from makeBatched is not visible in this
+// header -- confirm against the definition.
+TORCH_API Tensor addBatchDim(Tensor tensor, int64_t dim, int64_t level);
+// Certain dispatch keys must be propagated to the BatchedTensor (or, in general,
+// any wrapper Tensor subclasses). This is because there are methods on Tensor
+// that skip dispatch and check for the presence of a dispatch key (e.g. is_cpu()).
+// TODO: should probably contain more (or all?) backend keys
+// This set is the default `to_propagate` argument of
+// getKeysToPropagateToWrapper.
+constexpr DispatchKeySet kKeysToPropagateToWrapper({
+  DispatchKey::Negative,
+  DispatchKey::Conjugate,
+  DispatchKey::XLA,
+  DispatchKey::XPU,
+  DispatchKey::HPU,
+  DispatchKey::CUDA,
+  DispatchKey::CPU,
+  DispatchKey::PrivateUse1,
+  DispatchKey::SparseCPU,
+  DispatchKey::SparseCUDA,
+  DispatchKey::SparseCsrCPU,
+  DispatchKey::SparseCsrCUDA,
+});
+// Returns the subset of `tensor`'s dispatch keys that should be copied onto a
+// wrapper tensor.
+//   tensor       - tensor whose key set is inspected
+//   to_propagate - mask of keys eligible for propagation; defaults to
+//                  kKeysToPropagateToWrapper
+// The result is the intersection of the tensor's key set with `to_propagate`.
+inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) {
+  auto key_set = tensor.unsafeGetTensorImpl()->key_set();
+  // Honour the caller-supplied mask; intersecting with the global default here
+  // would silently ignore a custom `to_propagate`.
+  return key_set & to_propagate;
+}
+} // namespace at::functorch
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchingMetaprogramming.h ADDED Viewed

	@@ -0,0 +1,131 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include <ATen/Tensor.h>
+#include <ATen/VmapGeneratedPlumbing.h>
+// This file contains template metaprogramming things that are used for our
+// batching rules.
+//
+// See NOTE: [vmap plumbing] for more details on why this is necessary.
+// The plumbing has a bunch of metaprogramming hacks for determining the signature
+// of a batching rule from the signature of the operator, many of which use the
+// helper functions in this file.
+namespace at::functorch {
+// Metaprogramming things
+// Short aliases for the c10::guts::typelist utilities used below.
+template <class... Items> using typelist = c10::guts::typelist::typelist<Items...>;
+template <class TypeList> using head_t = c10::guts::typelist::head_t<TypeList>;
+template <class TL1, class TL2> using concat_t = c10::guts::typelist::concat_t<TL1, TL2>;
+// Declared but never defined in this header.
+// NOTE(review): presumably a debugging aid -- instantiating it yields a
+// compile error that names T; confirm intended use.
+template <typename T> class debug_t;
+// tail operation
+// tail<typelist<Head, Tail...>>::type is typelist<Tail...>, i.e. the list
+// without its first element. The primary template fires a static_assert for
+// any argument that is not a non-empty typelist.
+template<class TypeList>
+struct tail final {
+    static_assert(c10::guts::false_t<TypeList>::value,
+                  "In typelist::tail<T>, the T argument must be typelist<...>.");
+};
+template<class Head, class... Tail>
+struct tail<typelist<Head, Tail...>> final {
+  using type = typelist<Tail...>;
+};
+// Convenience alias for tail<TypeList>::type.
+template<class TypeList> using tail_t = typename tail<TypeList>::type;
+// Type-level conditional used by RemoveBatchDimAfterTensor.
+// `type` is `Tail` when `First` is one of the tensor-like types specialised
+// below and `Second` is std::optional<int64_t>; in every other case the
+// primary template applies and `type` is `Next`.
+template <class First, class Second, class Next, class Tail>
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext {
+  using type = Next;
+};
+// Tensor by value.
+template <class Next, class Tail>
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext<Tensor, std::optional<int64_t>, Next, Tail> {
+  using type = Tail;
+};
+// const Tensor&.
+template <class Next, class Tail>
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext<const Tensor&, std::optional<int64_t>, Next, Tail> {
+  using type = Tail;
+};
+// Tensor&.
+template <class Next, class Tail>
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext<Tensor&, std::optional<int64_t>, Next, Tail> {
+  using type = Tail;
+};
+// std::optional<Tensor> by value.
+template <class Next, class Tail>
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext<std::optional<Tensor>, std::optional<int64_t>, Next, Tail> {
+  using type = Tail;
+};
+// const std::optional<Tensor>&.
+template <class Next, class Tail>
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext<const std::optional<Tensor>&, std::optional<int64_t>, Next, Tail> {
+  using type = Tail;
+};
+// std::optional<Tensor>&.
+template <class Next, class Tail>
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext<std::optional<Tensor>&, std::optional<int64_t>, Next, Tail> {
+  using type = Tail;
+};
+// std::vector<Tensor> by value.
+template <class Next, class Tail>
+struct IfFirstIsTensorAndSecondisBatchDimThenTailElseNext<std::vector<Tensor>, std::optional<int64_t>, Next, Tail> {
+  using type = Tail;
+};
+// Recursively rewrites a typelist, dropping every std::optional<int64_t> that
+// directly follows a tensor-like type (as defined by
+// IfFirstIsTensorAndSecondisBatchDimThenTailElseNext).
+// At each step the head is kept and the recursion continues on either
+// `next` (the list without its head) or `tail` (the list without its first
+// two elements), depending on whether the (first, second) pair matched.
+template <class TypeList> struct RemoveBatchDimAfterTensor {
+  using first = head_t<TypeList>;
+  using next = tail_t<TypeList>;
+  using second = head_t<next>;
+  using tail = tail_t<next>;
+  using type = concat_t<
+    typelist<first>,
+    typename RemoveBatchDimAfterTensor<
+      typename IfFirstIsTensorAndSecondisBatchDimThenTailElseNext<first, second, next, tail>::type
+    >::type
+  >;
+};
+// Base case: a single-element list is returned unchanged.
+template <class Type> struct RemoveBatchDimAfterTensor<typelist<Type>> {
+  using type = typelist<Type>;
+};
+// Base case: the empty list is returned unchanged.
+template <> struct RemoveBatchDimAfterTensor<typelist<>> {
+  using type = typelist<>;
+};
+// Convenience alias for RemoveBatchDimAfterTensor<TypeList>::type.
+template<class TypeList> using remove_batch_dim_after_tensor_t = typename RemoveBatchDimAfterTensor<TypeList>::type;
+// Maps std::tuple<T> to T; every other type (including tuples with a
+// different number of elements) maps to itself.
+template <typename T> struct UnpackSingleItemTuple {
+  using type = T;
+};
+template <typename T> struct UnpackSingleItemTuple<std::tuple<T>> {
+  using type = T;
+};
+// Convenience alias for UnpackSingleItemTuple<T>::type.
+template <typename T> using unpack_single_item_tuple_t = typename UnpackSingleItemTuple<T>::type;
+// Builds the function type Return(Args...) from a return type and a typelist
+// of parameter types. The helper works on std::tuple<Args...>, so the typelist
+// is first converted with c10::guts::typelist::to_tuple_t.
+template <typename Return, typename TupleArgs> struct BuildFunctionHelper;
+template <typename Return, typename... Args> struct BuildFunctionHelper<Return, std::tuple<Args...>> {
+  using type = Return(Args...);
+};
+template <typename Return, typename TL>
+struct BuildFunction {
+  using type = typename BuildFunctionHelper<Return, c10::guts::typelist::to_tuple_t<TL>>::type;
+};
+// Convenience alias for BuildFunction<Return, TL>::type.
+template <typename Return, typename TL> using build_function_t = typename BuildFunction<Return, TL>::type;
+template <typename batch_rule_t> struct ToOperatorType {  // Derives a function type from batch_rule_t by running remove_batch_dim_after_tensor_t over its parameters and its returns.
+  using batch_rule_return_type = typename c10::guts::function_traits<batch_rule_t>::return_type;  // return type of batch_rule_t
+  using batch_rule_parameter_types = typename c10::guts::function_traits<batch_rule_t>::parameter_types;  // parameter types of batch_rule_t
+  using operator_parameter_types = remove_batch_dim_after_tensor_t<batch_rule_parameter_types>;  // parameters after the removal pass
+  using operator_return_type =  // return tuple -> typelist -> removal pass -> tuple -> bare type if only one element is left
+    unpack_single_item_tuple_t<
+      c10::guts::typelist::to_tuple_t<
+        remove_batch_dim_after_tensor_t<
+          c10::guts::typelist::from_tuple_t<batch_rule_return_type>>>>;  // from_tuple_t implies the batch rule returns a tuple -- TODO confirm
+  using type = build_function_t<operator_return_type, operator_parameter_types>;  // operator_return_type(operator_parameter_types...)
+};
+template <typename batch_rule_t> using to_operator_t = typename ToOperatorType<batch_rule_t>::type;  // convenience alias for ::type
+} // namespace at::functorch
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/DynamicLayer.h ADDED Viewed

	@@ -0,0 +1,129 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include <ATen/functorch/Macros.h>
+#include <c10/core/DispatchKey.h>
+#include <ATen/core/function_schema.h>
+#include <optional>
+#include <c10/core/impl/LocalDispatchKeySet.h>
+#include <ATen/functorch/Interpreter.h>
+#include <ATen/functorch/VmapInterpreter.h>
+#include <ATen/functorch/ADInterpreters.h>
+#include <ATen/functorch/FunctionalizeInterpreter.h>
+// Forward declared
+namespace c10 { struct AutogradMetaInterface; }
+namespace at::functorch  {
+// This file contains the implementation of functorch's interpreter stack.
+// See NOTE: [functorch interpreter stack] first before reading on.
+//
+// NB: the functorch interpreter stack is also referred to as:
+// - the "dynamic layer stack" -- an older name for "interpreter" was
+//   "dynamic layer".
+// - the "functorch mode stack". You can think of each functorch transform as a
+//   "mode" (in the same sense as torch_dispatch mode or torch_function mode),
+//   and functorch being an implementation of a "mode stack" where the modes
+//   may be arbitrarily composed.
+// DynamicLayer is basically the same thing as an Interpreter.
+// It represents a functorch transform and it holds an Interpreter,
+// which contains metadata related to the transform and instructions on
+// how to perform the transform.
+//
+// TODO: we can excise DynamicLayer in favor of Interpreter,
+// But I am going to leave it for now as a compatibility shim to avoid
+// needing to refactor a lot of callsites...
+// Thin wrapper around a single Interpreter (stored by value in interpreter_).
+// All non-inline members are only declared here; their definitions live
+// outside this header.
+struct TORCH_API DynamicLayer {
+  // Every argument after layerId is optional and defaults to std::nullopt.
+  // The optional parameters use the same names as the corresponding
+  // parameters of initAndPushDynamicLayer() so the two declarations read
+  // consistently.
+  explicit DynamicLayer(
+      TransformType transform_type,
+      int64_t layerId,
+      std::optional<c10::SymInt> batchSize = std::nullopt,
+      std::optional<RandomnessType> randomness = std::nullopt,
+      std::optional<bool> prev_grad_mode = std::nullopt,
+      std::optional<bool> prev_fwd_grad_mode = std::nullopt,
+      std::optional<bool> functionalize_add_back_views = std::nullopt);
+  TransformType key() const;
+  int64_t layerId() const;
+  // Access to the wrapped Interpreter (const and mutable overloads).
+  const Interpreter& interpreter() const { return interpreter_; }
+  Interpreter& interpreter() { return interpreter_; }
+  // Only valid for vmap
+  c10::SymInt batchSize() const;
+  RandomnessType randomness() const;
+ private:
+  Interpreter interpreter_;
+};
+TORCH_API int64_t initAndPushDynamicLayer(
+    TransformType transform_type,
+    std::optional<c10::SymInt> batch_size = std::nullopt,
+    std::optional<RandomnessType> randomness = std::nullopt,
+    std::optional<bool> prev_grad_mode = std::nullopt,
+    std::optional<bool> prev_fwd_grad_mode = std::nullopt,
+    std::optional<bool> functionalize_add_back_views = std::nullopt);
+TORCH_API DynamicLayer popDynamicLayerAndDeleteMetadata();
+TORCH_API std::optional<DynamicLayer> maybeCurrentDynamicLayer();
+TORCH_API const std::vector<DynamicLayer>& getDynamicLayerStack();
+TORCH_API void setDynamicLayerStack(const std::vector<DynamicLayer>& stack);
+TORCH_API void setDynamicLayerFrontBackKeysIncluded(bool included);
+// NOTE: [Life handles and lexically scoped transforms]
+// functorch transforms are lexically scoped.
+// Given a level, we store a "life handle" that is a boolean that tells us if the
+// transform with that level is active or not.
+//
+// functorch's TensorWrapper (for grad transforms) stores a life handle.
+// If a TensorWrapper escapes from the scope of the transform, then somehow
+// it must know it escaped; it can tell by querying the life handle.
+TORCH_API const std::shared_ptr<bool>& getLifeHandleForLevel(int64_t level);
+// Returns if an operator is in-place. An operator is inplace if:
+// 1. The first argument is a Tensor and it is being written to
+// 2. The first argument is being returned
+// 3. No other arguments are aliased
+// Here is an example of an in-place operator:
+// add_(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
+TORCH_API bool isInplaceOp(const c10::FunctionSchema& schema);
+// Given the index of an immutable (unwrapped) input and the schema, this returns the index of the output, if any, that aliases that input and should therefore remain unwrapped
+TORCH_API std::optional<size_t> findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input);
+TORCH_API Tensor unwrapIfDead(const Tensor& tensor);
+TORCH_API bool isDeadTensorWrapper(const Tensor& tensor);
+// Pretty printers
+TORCH_API std::ostream& operator<<(std::ostream& os, const DynamicLayer& layer);
+TORCH_API std::ostream& operator<<(std::ostream& os, const std::vector<DynamicLayer>& dynamicLayerStack);
+// While a functorch transform is active, torch.autograd.function._SingleLevelFunction
+// is disabled by default. The following two APIs are APIs for enabling
+// it. These are not user-facing APIs. We can delete this in the future, but
+// it is useful for debugging when something goes wrong with the
+// autograd.Function <> functorch interaction, which uses _SingleLevelFunction,
+// because it leads to loud errors if something is incorrect.
+TORCH_API void setSingleLevelAutogradFunctionAllowed(bool allowed);
+TORCH_API bool getSingleLevelAutogradFunctionAllowed();
+// While a functorch grad transform is active, Tensor.requires_grad_() gets
+// disabled. These two functions are the mechanism for controlling that.
+TORCH_API void setInplaceRequiresGradAllowed(bool allowed);
+TORCH_API bool getInplaceRequiresGradAllowed();
+TORCH_API DynamicLayer popDynamicLayer();
+TORCH_API int64_t pushDynamicLayer(DynamicLayer&& layer);
+} // namespace at::functorch
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h ADDED Viewed

	@@ -0,0 +1,27 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/functorch/Interpreter.h>
+namespace at::functorch {
+// This is the interpreter that handles the functionalize() transform.
+// See NOTE: [functorch interpreter stack] for more details.
+// Non-owning view over an Interpreter whose key() is
+// TransformType::Functionalize; the constructor asserts that this holds.
+struct FunctionalizeInterpreterPtr {
+  explicit FunctionalizeInterpreterPtr(const Interpreter* base)
+      : base_(base) {
+    TORCH_INTERNAL_ASSERT(base->key() == TransformType::Functionalize);
+  }
+
+  // Both accessors forward to the wrapped Interpreter.
+  TransformType key() const {
+    return base_->key();
+  }
+  int64_t level() const {
+    return base_->level();
+  }
+
+  void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+  void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case);
+
+  // Reads the flag out of the FunctionalizeInterpreterMeta alternative of the
+  // wrapped Interpreter's metadata variant.
+  bool functionalizeAddBackViews() const {
+    const auto& functionalize_meta =
+        std::get<FunctionalizeInterpreterMeta>(base_->meta());
+    return functionalize_meta.functionalizeAddBackViews_;
+  }
+
+ private:
+  const Interpreter* base_;
+};
+} // namespace at::functorch
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/functorch/Interpreter.h ADDED Viewed

	@@ -0,0 +1,358 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/functorch/Macros.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <c10/core/impl/LocalDispatchKeySet.h>
+#include <c10/util/Exception.h>
+#include <optional>
+#include <bitset>
+#include <utility>
+#include <variant>
+#include <nlohmann/json.hpp>
+namespace at::functorch {
+// NOTE: [functorch interpreter stack]
+//
+// functorch's dispatching system uses a stack of interpreters.
+// Historically we've referred to this as the "DynamicLayerStack".
+//
+// An interpreter is something that reads in the code it is passed
+// and then executes it. We have a different interpreter per-transform:
+// the "VmapInterpreter" is responsible for reading in operators (like aten::mv)
+// and executing the batched version of it (the batching rule for aten::mv).
+//
+// Concretely, each interpreter is responsible for two things:
+//
+// 1) process(ophandle, stack)
+// Given an operator handle and a stack of arguments, the interpreter is
+// responsible for figuring out how to execute the operation under the semantics
+// of the interpreter. For example, for VmapInterpreter, this is figuring out how to call
+// the batching rule.
+//
+// The batching rules are stored as kernels on the FuncTorchBatched key, so the way
+// VmapInterpreter calls the batching rule is roughly: (A) exclude all
+// dispatch keys aside from the Batched key, (B) redispatch so we get to the
+// Batched key.
+//
+// 2) sendToNextInterpreter(ophandle, stack)
+// The VmapInterpreter, when it sees aten::mv, will process it into a call to
+// aten::mm. It then needs to send the call to aten::mm to the next interpreter
+// in the interpreter stack.
+//
+// The VmapInterpreter just does this via a call to ophandle.callBoxed(stack)
+// and most Interpreters will implement it this way.
+enum class RandomnessType {  // Stored in VmapInterpreterMeta, whose to_json writes it as an integer: reordering enumerators changes the serialized form.
+    Error,      // always errors when calling a random function
+    Same,       // randomness appears the same across batches
+    Different,  // randomness appears different across batches
+    END  // presumably an end-of-range sentinel rather than a real policy -- TODO confirm
+};
+enum class TransformType {  // Returned by Interpreter::key(); Interpreter's to_json writes it as an integer, so reordering enumerators changes the serialized form.
+  Torch,  // Unused
+  Vmap,
+  Grad,  // reverse-mode AD, aka vjp
+  Jvp,  // forward-mode AD
+  Functionalize,
+};
+std::ostream& operator<<(std::ostream& os, const TransformType& t);
+// NOTE: [Interpreter "subclassing" design]
+//
+// How are various Interpreters for different transforms (vmap, grad, ...)
+// implemented?
+//
+// Accessing interpreters is in the hot-path of functorch so we have a constraint
+// that this code must be as fast as possible.
+//
+// As a result, we stay away from virtual methods and this causes our code
+// to look a little funny.
+//
+// `Interpreter` is the struct for Interpreters. It holds ALL of the
+// relevant information (what type of interpreter it is and the metadata).
+// Metadata for each interpreter is represented as a Union (std::variant)
+// of all possible metadata (VmapInterpreterMeta, GradInterpreterMeta, ...).
+//
+// Given an Interpreter, how do I get a "VmapInterpreter"? You may wish to do this
+// if you want to access the metadata fields (like batchSize and randomness).
+//
+// Each type of interpreter (e.g. Vmap) has a convenience struct
+// (e.g. VmapInterpreterPtr) associated with it.
+//
+// Construct the convenience struct with VmapInterpreterPtr(Interpreter*),
+// and then one can access methods on VmapInterpreterPtr like so:
+// >>> VmapInterpreterPtr(&interpreter).batchSize()
+//
+// Finally, Interpreter::process switches on the type of the interpreter
+// and calls one of {Transform}Interpreter::processImpl under the hood.
+// Same for Interpreter::sendToNextInterpreter :)
+// Metadata for a vmap Interpreter: the batch size and the randomness policy.
+struct VmapInterpreterMeta {
+  explicit VmapInterpreterMeta(c10::SymInt batchSize, RandomnessType randomness) :
+    batchSize_(std::move(batchSize)), randomness_(randomness) {}
+  c10::SymInt batchSize_;
+  // Value-initialized (RandomnessType::Error) so that a default-constructed
+  // object never carries an indeterminate enum value.
+  RandomnessType randomness_{};
+  VmapInterpreterMeta() = default;
+  VmapInterpreterMeta(const VmapInterpreterMeta&) = default;
+  VmapInterpreterMeta(VmapInterpreterMeta&&) = default;
+  VmapInterpreterMeta& operator=(const VmapInterpreterMeta&) = default;
+  VmapInterpreterMeta& operator=(VmapInterpreterMeta&&) = default;
+  ~VmapInterpreterMeta() = default;
+  // Writes "batchSize" and "randomness" (as an integer) into json_j.
+  // Raises via TORCH_CHECK when the batch size is a heap-allocated SymInt.
+  template <typename T>
+  friend void to_json(T& json_j, const VmapInterpreterMeta& json_t) {
+    TORCH_CHECK(
+      !json_t.batchSize_.is_heap_allocated(),
+      "Serialization for heap-allocated SymInt is not implemented yet"
+    );
+    json_j["batchSize"] = json_t.batchSize_.as_int_unchecked();
+    json_j["randomness"] = static_cast<int64_t>(json_t.randomness_);
+  }
+  // Reads back the two keys written by to_json.
+  template <typename T>
+  friend void from_json(const T& json_j, VmapInterpreterMeta& json_t) {
+    json_t.batchSize_ = c10::SymInt(SymInt::Unchecked::UNCHECKED, json_j["batchSize"]);
+    json_t.randomness_ = static_cast<RandomnessType>(json_j["randomness"]);
+  }
+};
+// Metadata for a grad Interpreter: a single prevGradMode flag.
+struct GradInterpreterMeta {
+  explicit GradInterpreterMeta(bool prevGradMode): prevGradMode_(prevGradMode) {}
+  GradInterpreterMeta() = default;
+  GradInterpreterMeta(const GradInterpreterMeta&) = default;
+  GradInterpreterMeta(GradInterpreterMeta&&) = default;
+  GradInterpreterMeta& operator=(const GradInterpreterMeta&) = default;
+  GradInterpreterMeta& operator=(GradInterpreterMeta&&) = default;
+  ~GradInterpreterMeta() = default;
+  // Value-initialized (false) so that a default-constructed object never
+  // carries an indeterminate bool.
+  bool prevGradMode_{};
+  // Writes the flag under the "prevGradMode" key.
+  template <typename T>
+  friend void to_json(T& json_j, const GradInterpreterMeta& json_t) {
+    json_j["prevGradMode"] = json_t.prevGradMode_;
+  }
+  // Reads the flag back from the "prevGradMode" key.
+  template <typename T>
+  friend void from_json(const T& json_j, GradInterpreterMeta& json_t) {
+    json_t.prevGradMode_ = json_j["prevGradMode"];
+  }
+};
+// Metadata for a jvp Interpreter: a single prevFwdGradMode flag.
+struct JvpInterpreterMeta {
+  explicit JvpInterpreterMeta(bool prevFwdGradMode) : prevFwdGradMode_(prevFwdGradMode) {}
+  JvpInterpreterMeta() = default;
+  JvpInterpreterMeta(const JvpInterpreterMeta&) = default;
+  JvpInterpreterMeta(JvpInterpreterMeta&&) = default;
+  JvpInterpreterMeta& operator=(const JvpInterpreterMeta&) = default;
+  JvpInterpreterMeta& operator=(JvpInterpreterMeta&&) = default;
+  ~JvpInterpreterMeta() = default;
+  // Value-initialized (false) so that a default-constructed object never
+  // carries an indeterminate bool.
+  bool prevFwdGradMode_{};
+  // Writes the flag under the "prevFwdGradMode" key.
+  template <typename T>
+  friend void to_json(T& json_j, const JvpInterpreterMeta& json_t) {
+    json_j["prevFwdGradMode"] = json_t.prevFwdGradMode_;
+  }
+  // Reads the flag back from the "prevFwdGradMode" key.
+  template <typename T>
+  friend void from_json(const T& json_j, JvpInterpreterMeta& json_t) {
+    json_t.prevFwdGradMode_ = json_j["prevFwdGradMode"];
+  }
+};
+// Metadata for a functionalize Interpreter: a single
+// functionalizeAddBackViews flag.
+struct FunctionalizeInterpreterMeta {
+  explicit FunctionalizeInterpreterMeta(bool functionalizeAddBackViews) :
+    functionalizeAddBackViews_(functionalizeAddBackViews) {}
+  FunctionalizeInterpreterMeta() = default;
+  FunctionalizeInterpreterMeta(const FunctionalizeInterpreterMeta&) = default;
+  FunctionalizeInterpreterMeta(FunctionalizeInterpreterMeta&&) = default;
+  FunctionalizeInterpreterMeta& operator=(const FunctionalizeInterpreterMeta&) = default;
+  FunctionalizeInterpreterMeta& operator=(FunctionalizeInterpreterMeta&&) = default;
+  ~FunctionalizeInterpreterMeta() = default;
+  // Value-initialized (false) so that a default-constructed object never
+  // carries an indeterminate bool.
+  bool functionalizeAddBackViews_{};
+  // Writes the flag under the "functionalizeAddBackViews" key.
+  template <typename T>
+  friend void to_json(T& json_j, const FunctionalizeInterpreterMeta& json_t) {
+    json_j["functionalizeAddBackViews"] = json_t.functionalizeAddBackViews_;
+  }
+  // Reads the flag back from the "functionalizeAddBackViews" key.
+  template <typename T>
+  friend void from_json(const T& json_j, FunctionalizeInterpreterMeta& json_t) {
+    json_t.functionalizeAddBackViews_ = json_j["functionalizeAddBackViews"];
+  }
+};
+// Union of every per-transform metadata type. The alternative order is kept
+// as-is because it fixes each alternative's variant index; the int64_t
+// alternative is the one Interpreter's to_json labels "Torch".
+using InterpreterMeta = std::variant<
+    int64_t,
+    GradInterpreterMeta,
+    JvpInterpreterMeta,
+    VmapInterpreterMeta,
+    FunctionalizeInterpreterMeta>;
+struct Interpreter {  // One functorch transform: its type, level, optionally saved local dispatch key set, shared liveness flag and per-transform metadata.
+  // factory functions
+  static Interpreter Vmap(int64_t level, c10::SymInt batchSize, RandomnessType randomness) {  // each factory goes through the private constructor, so is_alive starts out false
+    return Interpreter(TransformType::Vmap, level, VmapInterpreterMeta(std::move(batchSize), randomness));
+  }
+  static Interpreter Grad(int64_t level, bool prevGradMode) {
+    return Interpreter(TransformType::Grad, level, GradInterpreterMeta(prevGradMode));
+  }
+  static Interpreter Jvp(int64_t level, bool prevFwdGradMode) {
+    return Interpreter(TransformType::Jvp, level, JvpInterpreterMeta(prevFwdGradMode));
+  }
+  static Interpreter Functionalize(int64_t level, bool functionalizeAddBackViews) {
+    return Interpreter(TransformType::Functionalize, level, FunctionalizeInterpreterMeta(functionalizeAddBackViews));
+  }
+  // methods
+  TransformType key() const { return type_; }
+  int64_t level() const { return level_; }
+  const InterpreterMeta& meta() const { return meta_; }
+  void process(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+  void sendToNextInterpreter(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case);
+  void saveLocalDispatchKeySet(c10::impl::LocalDispatchKeySet keyset) {  // asserts that nothing is saved yet, then stores keyset
+    TORCH_INTERNAL_ASSERT(!savedLocalDispatchKeySet_.has_value());
+    savedLocalDispatchKeySet_ = keyset;
+  }
+  void clearSavedLocalDispatchKeySet() {  // asserts that a key set is saved, then resets it to nullopt
+    TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value());
+    savedLocalDispatchKeySet_ = std::nullopt;
+  }
+  c10::impl::LocalDispatchKeySet getSavedLocalDispatchKeySet() const {  // asserts that a key set is saved and returns a copy of it
+    TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value());
+    return *savedLocalDispatchKeySet_;
+  }
+  // An Interpreter is alive if we are currently inside the ongoing transform
+  // for the interpreter. For example, vmap(f)(x); inside of f, the vmap's
+  // corresponding Interpreter is alive, even when it is not on the DynamicLayerStack.
+  bool is_alive() const {  // dereferences is_alive_ without a null check; a default-constructed Interpreter has a null is_alive_
+    return *is_alive_;
+  }
+  const std::shared_ptr<bool>& is_alive_ptr() const {
+    return is_alive_;
+  }
+  void set_is_alive(bool alive) {  // writes through the shared flag, so every holder of is_alive_ptr() sees the new value
+    *is_alive_ = alive;
+  }
+  // Please don't use this
+  explicit Interpreter() = default;
+  template <typename T>
+  friend void to_json(T& json_j, const Interpreter& json_t) {  // writes the keys "type", "level", "savedLocalDispatchKeySet", "is_alive" and "meta"
+    json_j["type"] = static_cast<int64_t>(json_t.type_);
+    json_j["level"] = json_t.level_;
+    if (json_t.savedLocalDispatchKeySet_) {
+      json_j["savedLocalDispatchKeySet"] = {
+        {"included", json_t.savedLocalDispatchKeySet_->included_.raw_repr()},
+        {"excluded", json_t.savedLocalDispatchKeySet_->excluded_.raw_repr()}
+      };
+    } else {
+      json_j["savedLocalDispatchKeySet"] = nlohmann::json();  // JSON null stands for "no key set saved"
+    }
+    json_j["is_alive"] = *json_t.is_alive_;  // no null check: is_alive_ must be set
+    std::visit([&](auto&& arg) {  // "meta" becomes a one-entry object keyed by the name of the active alternative
+        using V = std::decay_t<decltype(arg)>;
+        if constexpr (std::is_same_v<V, int64_t>) {
+          json_j["meta"] = {{"Torch", arg}};
+        } else if constexpr (std::is_same_v<V, GradInterpreterMeta>) {
+          json_j["meta"] = {{"Grad", arg}};
+        } else if constexpr (std::is_same_v<V, JvpInterpreterMeta>) {
+          json_j["meta"] = {{"Jvp", arg}};
+        } else if constexpr (std::is_same_v<V, VmapInterpreterMeta>) {
+          json_j["meta"] = {{"Vmap", arg}};
+        } else if constexpr (std::is_same_v<V, FunctionalizeInterpreterMeta>) {
+          json_j["meta"] = {{"Functionalize", arg}};
+        } else {
+          static_assert(false && sizeof(V), "unknown variant case");  // fires only if InterpreterMeta gains an alternative not handled above
+        }
+    }, json_t.meta_);
+  }
+  template <typename T>
+  friend void from_json(const T& json_j, Interpreter& json_t) {  // inverse of to_json: rebuilds json_t from the same keys
+    json_t.type_ = static_cast<TransformType>(json_j["type"]);
+    json_t.level_ = json_j["level"];
+    auto savedLocalDispatchKeySet = json_j["savedLocalDispatchKeySet"];
+    if (savedLocalDispatchKeySet.is_null()) {
+      json_t.savedLocalDispatchKeySet_ = std::nullopt;
+    } else {
+      c10::impl::PODLocalDispatchKeySet pod;
+      pod.set_included(DispatchKeySet::from_raw_repr(savedLocalDispatchKeySet["included"].template get<uint64_t>()));
+      pod.set_excluded(DispatchKeySet::from_raw_repr(savedLocalDispatchKeySet["excluded"].template get<uint64_t>()));
+      json_t.savedLocalDispatchKeySet_ = c10::impl::LocalDispatchKeySet(pod);
+    }
+    json_t.is_alive_ = std::make_shared<bool>(json_j["is_alive"]);  // allocates a fresh flag; it is not shared with the Interpreter that was serialized
+    auto meta = json_j["meta"];
+    if (meta.contains("Torch")) {  // the first key found, in this order, selects the variant alternative
+      json_t.meta_.emplace<int64_t>(meta["Torch"].template get<int64_t>());
+    } else if (meta.contains("Grad")) {
+      json_t.meta_.emplace<GradInterpreterMeta>(meta["Grad"].template get<GradInterpreterMeta>());
+    } else if (meta.contains("Jvp")) {
+      json_t.meta_.emplace<JvpInterpreterMeta>(meta["Jvp"].template get<JvpInterpreterMeta>());
+    } else if (meta.contains("Vmap")) {
+      json_t.meta_.emplace<VmapInterpreterMeta>(meta["Vmap"].template get<VmapInterpreterMeta>());
+    } else if (meta.contains("Functionalize")) {
+      json_t.meta_.emplace<FunctionalizeInterpreterMeta>(meta["Functionalize"].template get<FunctionalizeInterpreterMeta>());
+    } else {
+      TORCH_CHECK(false, "unknown interpreter metadata type");
+    }
+  }
+  std::string serialize() const {  // JSON text produced through to_json
+    return nlohmann::json(*this).dump();
+  }
+  static Interpreter deserialize(const std::string& serialized) {  // parses JSON text and rebuilds an Interpreter through from_json
+    return nlohmann::json::parse(serialized).get<Interpreter>();
+  }
+ private:
+  explicit Interpreter(TransformType type, int64_t level, InterpreterMeta meta):  // used by the factory functions; allocates the liveness flag set to false
+    type_(type), level_(level), is_alive_(std::make_shared<bool>(false)), meta_(std::move(meta)) {}
+  // fields
+  TransformType type_{};
+  int64_t level_{};
+  std::optional<c10::impl::LocalDispatchKeySet> savedLocalDispatchKeySet_;  // empty until saveLocalDispatchKeySet() is called
+  std::shared_ptr<bool> is_alive_;  // null after default construction
+  InterpreterMeta meta_;  // default construction selects the first alternative (int64_t)
+};
+// Applies the following for-loop:
+// for i in range(begin, end):
+//   args[i] = func(args[i])
+void foreachTensorInplace(std::vector<IValue>& args, int64_t begin, int64_t end,
+    std::function<Tensor(const Tensor&)> func);
+// Applies the following for-loop:
+// for i in range(begin, end):
+//   flag = (use_flag_relative[i - begin] == 1)  <-- treats use_flag_relative as a bitset
+//   args[i] = func(args[i], flag)
+void foreachTensorInplaceWithFlag(std::vector<IValue>& args, int64_t begin, int64_t end,
+    const std::bitset<64> use_flag_relative, const std::function<Tensor(const Tensor&, bool)>& func);
+std::vector<int64_t> findUnwrappedInputs(std::vector<IValue>& args, int64_t begin, int64_t end);
+DispatchKeySet keysToExcludeWhenEnteringDynamicLayer(TransformType key);
+void setup_dispatch_key_tls(TransformType key, DispatchKeySet include);
+void sanityCheckStack(const c10::OperatorHandle& op, torch::jit::Stack* stack);
+} // namespace at::functorch
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)