BryanW committed on Mar 23

Commit

dcb4c75

verified ·

1 Parent(s): 76cbda0

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ArrayRef.h +7 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Backend.h +7 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CPUApplyUtils.h +356 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CPUFixedAllocator.h +38 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CPUGeneratorImpl.h +54 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CUDAFunctions.h +34 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CollapseDims.h +99 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h +30 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Config.h +28 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Device.h +7 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/DeviceAccelerator.h +118 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/DimVector.h +7 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Dispatch_v2.h +182 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/DynamicLibrary.h +41 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/EmptyTensor.h +171 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ExpandUtils.h +540 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/FunctionalTensorWrapper.h +476 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Functions.h +1476 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/InitialTensorOptions.h +20 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h +166 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/LegacyVmapMode.h +31 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/LegacyVmapTransforms.h +188 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/MethodOperators.h +449 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/NamedTensor.h +6 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/NativeMetaFunctions.h +1352 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/NestedTensorImpl.h +292 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/NumericUtils.h +208 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ParallelOpenMP.h +59 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/RedispatchFunctions.h +0 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/RegistrationDeclarations.h +0 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/SDPBackend.h +21 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Scalar.h +8 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/StorageUtils.h +54 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/TensorAccessor.h +7 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ThreadLocalPythonObjects.h +26 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ThreadLocalState.h +131 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Utils.h +143 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpp_custom_type_hack.h +115 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/THC/THCAtomics.cuh +8 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/THC/THCDeviceUtils.cuh +8 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/ConvUtils.h +195 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/Fbgemm.h +1515 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmBuild.h +116 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmConvert.h +205 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmEmbedding.h +383 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmFP16.h +60 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmFP32.h +54 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmFPCommon.h +319 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmI64.h +36 -0
URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmI8DepthwiseAvx2.h +117 -0

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ArrayRef.h ADDED Viewed

	@@ -0,0 +1,7 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/util/ArrayRef.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Backend.h ADDED Viewed

	@@ -0,0 +1,7 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/Backend.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CPUApplyUtils.h ADDED Viewed

	@@ -0,0 +1,356 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/CollapseDims.h>
+#include <ATen/Parallel.h>
+#include <ATen/TensorUtils.h>
+#include <c10/util/irange.h>
+#include <cstring>
+#include <limits>
+namespace at {
+/*
+ * The basic strategy for apply is as follows:
+ *
+ * 1. Starting with the outermost index, loop until we reach a dimension where
+ * the data is no longer contiguous, i.e. the stride at that dimension is not
+ * equal to the size of the tensor defined by the outer dimensions. Let's call
+ * this outer (contiguous) tensor A. Note that if the Tensor is contiguous, then
+ * A is equal to the entire Tensor. Let's call the inner tensor B.
+ *
+ * 2. We loop through the indices in B, starting at its outermost dimension. For
+ * example, if B is a 2x2 matrix, then we do:
+ *
+ * B[0][0]
+ * B[0][1]
+ * B[1][0]
+ * B[1][1]
+ *
+ * We set the offset into the underlying storage as (storageOffset + stride_B *
+ * index_B), i.e. basically we compute the offset into the storage as we would
+ * normally for a Tensor. But because we are guaranteed the subsequent data is
+ * contiguous in memory, we can simply loop for sizeof(A) iterations and perform
+ * the operation, without having to follow the order described by the strides of
+ * A.
+ *
+ * 3. As an optimization, we merge dimensions of A that are contiguous in
+ * memory. For example, if A is a 3x3x3x3 tensor narrowed from a 3x3x4x3 tensor,
+ * then the first two dimensions can be merged for the purposes of APPLY,
+ * reducing the number of nested loops.
+ */
+// Returns tensor_.permute(order), where `order` lists the dimensions of
+// tensor_ sorted by decreasing stride.
+inline Tensor sort_strides(Tensor& tensor_) {
+  const IntArrayRef strides = tensor_.strides();
+  const int64_t ndim = tensor_.ndimension();
+  // Start from the identity permutation 0, 1, ..., ndim - 1.
+  std::vector<int64_t> order(ndim);
+  for (int64_t d = 0; d < ndim; ++d) {
+    order[d] = d;
+  }
+  const auto by_descending_stride = [&strides](int64_t lhs, int64_t rhs) {
+    return strides[lhs] > strides[rhs];
+  };
+  std::sort(order.begin(), order.end(), by_descending_stride);
+  return tensor_.permute(order);
+}
+// Move-only cursor over a strided tensor whose per-dimension state lives in
+// fixed-size arrays of capacity N (no heap allocation).
+//
+// After construction, sizes_/strides_ hold the layout as rewritten in place by
+// collapse_dims, dim_ is the dimension count collapse_dims reports, and
+// counter_ is the current index along each of those dimensions (all zero).
+//
+// Throws (via TORCH_CHECK) if the tensor has more than N dimensions.
+template <typename T, int N>
+struct strided_tensor_iter_fixed {
+ public:
+  T* data_ = nullptr;
+  int64_t dim_ = 0;
+  // NOLINTNEXTLINE(*array*)
+  int64_t counter_[N] = {0};
+  // NOLINTNEXTLINE(*array*)
+  int64_t sizes_[N] = {0};
+  // NOLINTNEXTLINE(*array*)
+  int64_t strides_[N] = {0};
+  strided_tensor_iter_fixed(strided_tensor_iter_fixed const&) = delete;
+  strided_tensor_iter_fixed& operator=(strided_tensor_iter_fixed const& x) =
+      delete;
+  strided_tensor_iter_fixed(strided_tensor_iter_fixed&&) noexcept = default;
+  strided_tensor_iter_fixed& operator=(strided_tensor_iter_fixed&& x) noexcept =
+      default;
+  ~strided_tensor_iter_fixed() noexcept = default;
+  // `sort_strides` is accepted for interface compatibility but is not used.
+  strided_tensor_iter_fixed(
+      Tensor& tensor,
+      [[maybe_unused]] bool sort_strides = false)
+      : data_(tensor.data_ptr<T>()) {
+    // The arrays hold at most N entries; copying more would overflow them.
+    TORCH_CHECK(
+        tensor.dim() <= N,
+        "strided_tensor_iter_fixed supports at most ",
+        N,
+        " dimensions, but got a tensor with ",
+        tensor.dim(),
+        " dimensions");
+    // counter_, sizes_ and strides_ are already zero-filled by their default
+    // member initializers, so no explicit memset is needed.
+    if (tensor.dim() > 0) {
+      std::memcpy(
+          sizes_, tensor.sizes().data(), tensor.dim() * sizeof(int64_t));
+      std::memcpy(
+          strides_, tensor.strides().data(), tensor.dim() * sizeof(int64_t));
+    }
+    // collapse_dims rewrites sizes_/strides_ in place and returns the new
+    // dimension count as the second element of its result.
+    dim_ = std::get<1>(collapse_dims(sizes_, strides_, tensor.ndimension()));
+  }
+};
+// Move-only cursor over a strided tensor of arbitrary rank; per-dimension state
+// is stored in heap-allocated vectors.
+//
+// After construction, sizes_/strides_ hold the layout as rewritten in place by
+// collapse_dims, dim_ is the dimension count collapse_dims reports, and
+// counter_ is the current index along each dimension (all zero).
+template <typename T>
+struct strided_tensor_iter {
+ public:
+  T* data_ = nullptr;
+  int64_t dim_;
+  std::vector<int64_t> counter_;
+  std::vector<int64_t> sizes_;
+  std::vector<int64_t> strides_;
+  strided_tensor_iter(strided_tensor_iter const&) = delete;
+  strided_tensor_iter& operator=(strided_tensor_iter const& x) = delete;
+  strided_tensor_iter(strided_tensor_iter&&) noexcept = default;
+  strided_tensor_iter& operator=(strided_tensor_iter&&) noexcept = default;
+  ~strided_tensor_iter() noexcept = default;
+  strided_tensor_iter(Tensor& tensor)
+      : data_(tensor.data_ptr<T>()),
+        dim_(tensor.ndimension()),
+        // Always keep at least one counter slot: collapse_dims never reports
+        // fewer than one dimension, and the iteration helpers index
+        // counter_[dim_ - 1].
+        counter_(dim_ > 0 ? dim_ : 1, 0),
+        sizes_(tensor.sizes().vec()),
+        strides_(tensor.strides().vec()) {
+    if (dim_ == 0) {
+      // For a 0-dim tensor the vectors would be empty, but collapse_dims
+      // writes sizes[0] and strides[0] when everything collapses to a point.
+      // Give it one valid slot to write into.
+      sizes_.assign(1, 1);
+      strides_.assign(1, 1);
+    }
+    dim_ = std::get<1>(collapse_dims(sizes_.data(), strides_.data(), dim_));
+  }
+};
+// Returns true when every tensor in `tensors` has the same number of elements
+// (trivially true for an empty list).
+inline bool _all_equal_numel(at::ArrayRef<Tensor> tensors) {
+  if (tensors.empty()) {
+    return true;
+  }
+  const int64_t expected_numel = tensors[0].numel();
+  // Comparing the first tensor against itself is harmless and keeps the loop
+  // a plain range-for.
+  for (const auto& tensor : tensors) {
+    if (tensor.numel() != expected_numel) {
+      return false;
+    }
+  }
+  return true;
+}
+inline std::string _all_equal_numel_error(at::ArrayRef<Tensor> tensors) {
+  std::ostringstream oss;
+  oss << "inconsistent tensor size, expected ";
+  for (size_t i = 0; i < tensors.size() - 1; i++) {
+    oss << tensors[i].sizes() << ", ";
+  }
+  oss << "and " << tensors[tensors.size() - 1].sizes()
+      << " to have the same number of elements, but got ";
+  for (size_t i = 0; i < tensors.size() - 1; i++) {
+    oss << tensors[i].numel() << ", ";
+  }
+  oss << "and " << tensors[tensors.size() - 1].numel()
+      << " elements respectively";
+  return oss.str();
+}
+// Validates the inputs of a CPU_tensor_apply* call and reports whether there
+// is anything to iterate over.
+//
+// Checks device type (kCPU), layout (kStrided) and equal element counts.
+// Returns false if any tensor has zero elements, true otherwise.
+inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
+  checkDeviceType("CPU_tensor_apply", tensors, kCPU);
+  checkLayout("CPU_tensor_apply", tensors, kStrided);
+  TORCH_CHECK(_all_equal_numel(tensors), _all_equal_numel_error(tensors));
+  // An empty tensor has no elements
+  bool has_elements = true;
+  for (const auto& tensor : tensors) {
+    if (tensor.numel() == 0) {
+      has_elements = false;
+      break;
+    }
+  }
+  return has_elements;
+}
+// Returns the largest ndimension() among `tensors`, or 0 for an empty list.
+inline int64_t _max_dim_tensors(ArrayRef<Tensor> tensors) {
+  int64_t widest = 0;
+  for (const auto& tensor : tensors) {
+    const int64_t rank = tensor.ndimension();
+    if (rank > widest) {
+      widest = rank;
+    }
+  }
+  return widest;
+}
+// Terminates the recursion once every iterator has been advanced.
+inline void iterate(int64_t /*size*/) {}
+// Steps each iterator `size` elements along its last dimension
+// (index dim_ - 1). Counters are not wrapped here.
+template <typename Arg, typename... Args>
+inline void iterate(int64_t size, Arg& iter, Args&... iter_tail) {
+  const auto innermost = iter.dim_ - 1;
+  iter.counter_[innermost] += size;
+  iter.data_ += size * iter.strides_[innermost];
+  iterate(size, iter_tail...);
+}
+// With no iterators left to inspect, nothing prevents continuing.
+inline bool iterate_continue() {
+  return true;
+}
+// Returns true while every iterator still has elements left along its last
+// dimension (counter below size); stops at the first exhausted iterator.
+template <typename Arg, typename... Args>
+inline bool iterate_continue(Arg& iter, Args&... iter_tail) {
+  const auto innermost = iter.dim_ - 1;
+  if (!(iter.counter_[innermost] < iter.sizes_[innermost])) {
+    return false;
+  }
+  return iterate_continue(iter_tail...);
+}
+// Base case: with no iterators there is no limit.
+inline int64_t max_iterate_size() {
+  return std::numeric_limits<int64_t>::max();
+}
+// Returns the smallest number of elements remaining along the last dimension
+// (size minus counter) across all given iterators.
+template <typename Arg, typename... Args>
+inline int64_t max_iterate_size(Arg& iter, Args&... iter_tail) {
+  const auto innermost = iter.dim_ - 1;
+  const int64_t remaining_here =
+      iter.sizes_[innermost] - iter.counter_[innermost];
+  const int64_t remaining_rest = max_iterate_size(iter_tail...);
+  return remaining_here < remaining_rest ? remaining_here : remaining_rest;
+}
+inline void iterate_overflow() {} // Recursion terminator: no iterators left to normalize.
+template <typename Arg, typename... Args>
+inline void iterate_overflow(Arg& iter, Args&... iter_tail) { // Carries an exhausted last-dimension counter into the outer dimensions, for each iterator in turn.
+  if (iter.counter_[iter.dim_ - 1] == iter.sizes_[iter.dim_ - 1]) { // only act once the last dimension has been fully traversed
+    for (int64_t i = iter.dim_ - 1; i > 0; i--) { // walk outward; dimension 0 is never reset here
+      if (iter.counter_[i] == iter.sizes_[i]) {
+        iter.counter_[i] = 0;
+        iter.counter_[i - 1]++;
+        iter.data_ = iter.data_ - (iter.sizes_[i] * iter.strides_[i]) + // rewind the full sweep over dimension i ...
+            iter.strides_[i - 1]; // ... and take one step along dimension i - 1
+      }
+    }
+  }
+  iterate_overflow(iter_tail...); // repeat for the remaining iterators
+}
+inline void forward(int64_t /*offset*/) {} // Recursion terminator: no iterators left to advance.
+template <typename Arg, typename... Args>
+inline void forward(int64_t offset, Arg& iter, Args&... iter_tail) { // Advances each iterator by a linear element offset, split into per-dimension steps.
+  int64_t multi = offset; // part of the linear offset not yet distributed over dimensions
+  for (int64_t i = iter.dim_ - 1; i >= 0; i--) { // last dimension first
+    int64_t inc = multi % iter.sizes_[i]; // step along dimension i; NOTE(review): assumes sizes_[i] != 0 — callers appear to filter empty tensors first, confirm
+    multi = multi / iter.sizes_[i]; // remainder carried to the next outer dimension
+    iter.data_ = iter.data_ + inc * iter.strides_[i];
+    iter.counter_[i] += inc; // counters are incremented, not assigned, and not wrapped here
+  }
+  forward(offset, iter_tail...); // the same offset is applied to every remaining iterator
+}
+// Base case: no iterators, so the largest dimension count is 0.
+inline int64_t max_dim() {
+  return 0;
+}
+// Returns the largest dim_ among the given iterators.
+template <typename Arg, typename... Args>
+inline int64_t max_dim(Arg& iter, Args&... iter_tail) {
+  const int64_t rest = max_dim(iter_tail...);
+  return iter.dim_ < rest ? rest : iter.dim_;
+}
+inline void apply_op() {} // Zero-argument overload; does nothing.
+template <typename Op, typename... Args>
+inline void apply_op( // Calls op once per element for `numel` elements, after skipping `offset` elements, keeping all iterators in lockstep.
+    int64_t numel,
+    int64_t offset,
+    const Op& op,
+    Args... iters) { // the iterators are taken by value
+  // For 0-dim tensors
+  if (numel == 1 && max_dim(iters...) == 0) {
+    op(*iters.data_...); // op receives each iterator's current element (dereferenced), not the pointer
+    return;
+  }
+  if (offset > 0)
+    forward(offset, iters...);
+  // Splitting this into chunks helps the compiler create faster assembly
+  for (int64_t i = 0; i < numel;) { // i counts processed elements; it is advanced only in the inner loop
+    for (; iterate_continue(iters...) && i < numel;) { // run until some iterator exhausts its last dimension
+      op(*iters.data_...);
+      iterate(1, iters...);
+      i++;
+    }
+    iterate_overflow(iters...); // carry exhausted counters into the outer dimensions before resuming
+  }
+}
+/*
+  Apply a pointwise operator to a sequence of tensors.
+  The calling convention for op is a function/functor that takes the same
+  number of references of type scalar as the number of given tensors: apply_op
+  dereferences each iterator's current element before calling op. For example,
+  to compute a = b * c, op would be of the form:
+  [](scalar& a_val, const scalar& b_val, const scalar& c_val) { a_val =
+  b_val * c_val; };
+*/
+// Applies `op` element-wise across two tensors. Does nothing if
+// _apply_preamble reports there is no work.
+template <typename scalar1, typename scalar2, typename Op>
+inline void CPU_tensor_apply2(Tensor tensor1, Tensor tensor2, const Op op) {
+  // Tensors with at most this many dimensions use the array-backed iterator.
+  constexpr int kMaxFixedDims = 8;
+  const bool has_work = _apply_preamble({tensor1, tensor2});
+  if (!has_work) {
+    return;
+  }
+  const int64_t numel = tensor1.numel();
+  if (_max_dim_tensors({tensor1, tensor2}) > kMaxFixedDims) {
+    // Too many dimensions for the fixed arrays: use the vector-backed
+    // iterator instead.
+    apply_op(
+        numel,
+        0,
+        op,
+        strided_tensor_iter<scalar1>(tensor1),
+        strided_tensor_iter<scalar2>(tensor2));
+    return;
+  }
+  apply_op(
+      numel,
+      0,
+      op,
+      strided_tensor_iter_fixed<scalar1, kMaxFixedDims>(tensor1),
+      strided_tensor_iter_fixed<scalar2, kMaxFixedDims>(tensor2));
+}
+// Applies `op` element-wise across three tensors. Does nothing if
+// _apply_preamble reports there is no work.
+template <typename scalar1, typename scalar2, typename scalar3, typename Op>
+inline void CPU_tensor_apply3(
+    Tensor tensor1,
+    Tensor tensor2,
+    Tensor tensor3,
+    const Op op) {
+  // Tensors with at most this many dimensions use the array-backed iterator.
+  constexpr int kMaxFixedDims = 8;
+  const bool has_work = _apply_preamble({tensor1, tensor2, tensor3});
+  if (!has_work) {
+    return;
+  }
+  const int64_t numel = tensor1.numel();
+  if (_max_dim_tensors({tensor1, tensor2, tensor3}) > kMaxFixedDims) {
+    // Too many dimensions for the fixed arrays: use the vector-backed
+    // iterator instead.
+    apply_op(
+        numel,
+        0,
+        op,
+        strided_tensor_iter<scalar1>(tensor1),
+        strided_tensor_iter<scalar2>(tensor2),
+        strided_tensor_iter<scalar3>(tensor3));
+    return;
+  }
+  apply_op(
+      numel,
+      0,
+      op,
+      strided_tensor_iter_fixed<scalar1, kMaxFixedDims>(tensor1),
+      strided_tensor_iter_fixed<scalar2, kMaxFixedDims>(tensor2),
+      strided_tensor_iter_fixed<scalar3, kMaxFixedDims>(tensor3));
+}
+// Applies `op` element-wise across four tensors. Does nothing if
+// _apply_preamble reports there is no work.
+template <
+    typename scalar1,
+    typename scalar2,
+    typename scalar3,
+    typename scalar4,
+    typename Op>
+inline void CPU_tensor_apply4(
+    Tensor tensor1,
+    Tensor tensor2,
+    Tensor tensor3,
+    Tensor tensor4,
+    const Op op) {
+  // Tensors with at most this many dimensions use the array-backed iterator.
+  constexpr int kMaxFixedDims = 8;
+  const bool has_work = _apply_preamble({tensor1, tensor2, tensor3, tensor4});
+  if (!has_work) {
+    return;
+  }
+  const int64_t numel = tensor1.numel();
+  if (_max_dim_tensors({tensor1, tensor2, tensor3, tensor4}) > kMaxFixedDims) {
+    // Too many dimensions for the fixed arrays: use the vector-backed
+    // iterator instead.
+    apply_op(
+        numel,
+        0,
+        op,
+        strided_tensor_iter<scalar1>(tensor1),
+        strided_tensor_iter<scalar2>(tensor2),
+        strided_tensor_iter<scalar3>(tensor3),
+        strided_tensor_iter<scalar4>(tensor4));
+    return;
+  }
+  apply_op(
+      numel,
+      0,
+      op,
+      strided_tensor_iter_fixed<scalar1, kMaxFixedDims>(tensor1),
+      strided_tensor_iter_fixed<scalar2, kMaxFixedDims>(tensor2),
+      strided_tensor_iter_fixed<scalar3, kMaxFixedDims>(tensor3),
+      strided_tensor_iter_fixed<scalar4, kMaxFixedDims>(tensor4));
+}
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CPUFixedAllocator.h ADDED Viewed

	@@ -0,0 +1,38 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/Allocator.h>
+#include <c10/util/Exception.h>
+// This file creates a fake allocator that just throws exceptions if
+// it is actually used.
+// The state passed to the allocator is the std::function<void(void*)> that is
+// called when the blob is released by ATen.
+namespace at {
+// Allocation entry point of the fixed allocator: always fails the check,
+// since a view of an external blob cannot be resized.
+static void* cpu_fixed_malloc(void*, ptrdiff_t) {
+  TORCH_CHECK(false, "attempting to resize a tensor view of an external blob");
+}
+// Reallocation entry point: fails the check for the same reason as
+// cpu_fixed_malloc.
+static void* cpu_fixed_realloc(void*, void*, ptrdiff_t) {
+  TORCH_CHECK(false, "attempting to resize a tensor view of an external blob");
+}
+// Release entry point: `state` is a heap-allocated
+// std::function<void(void*)>. It is invoked with `allocation` and then
+// deleted.
+static void cpu_fixed_free(void* state, void* allocation) {
+  using ReleaseFn = std::function<void(void*)>;
+  auto* release_fn = static_cast<ReleaseFn*>(state);
+  (*release_fn)(allocation);
+  delete release_fn;
+}
+// Bundles the three entry points above, in malloc / realloc / free order.
+static Allocator CPU_fixed_allocator = {
+    cpu_fixed_malloc,
+    cpu_fixed_realloc,
+    cpu_fixed_free};
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CPUGeneratorImpl.h ADDED Viewed

	@@ -0,0 +1,54 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/Generator.h>
+#include <ATen/core/MT19937RNGEngine.h>
+#include <c10/core/GeneratorImpl.h>
+#include <optional>
+namespace at {
+struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl { // Generator implementation derived from c10::GeneratorImpl; holds an at::mt19937 engine and two optional cached normal samples.
+  // Constructors
+  CPUGeneratorImpl(uint64_t seed_in = default_rng_seed_val); // the seed defaults to default_rng_seed_val
+  ~CPUGeneratorImpl() override = default;
+  // CPUGeneratorImpl methods
+  std::shared_ptr<CPUGeneratorImpl> clone() const; // not an override; presumably built on clone_impl() — confirm in the implementation file
+  void set_current_seed(uint64_t seed) override;
+  void set_offset(uint64_t offset) override;
+  uint64_t get_offset() const override;
+  uint64_t current_seed() const override;
+  uint64_t seed() override; // non-const, unlike current_seed()
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
+  static c10::DeviceType device_type();
+  uint32_t random(); // 32-bit result; defined out of line
+  uint64_t random64(); // 64-bit result; defined out of line
+  std::optional<float> next_float_normal_sample(); // presumably reads next_float_normal_sample_ — confirm
+  std::optional<double> next_double_normal_sample(); // presumably reads next_double_normal_sample_ — confirm
+  void set_next_float_normal_sample(std::optional<float> randn);
+  void set_next_double_normal_sample(std::optional<double> randn);
+  at::mt19937 engine(); // returns the engine by value
+  void set_engine(at::mt19937 engine);
+ private:
+  CPUGeneratorImpl* clone_impl() const override; // overrides the base-class hook and returns a raw pointer
+  at::mt19937 engine_;
+  std::optional<float> next_float_normal_sample_; // may be empty
+  std::optional<double> next_double_normal_sample_; // may be empty
+};
+namespace detail {
+TORCH_API const Generator& getDefaultCPUGenerator();
+TORCH_API Generator
+createCPUGenerator(uint64_t seed_val = default_rng_seed_val);
+} // namespace detail
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CUDAFunctions.h ADDED Viewed

	@@ -0,0 +1,34 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <ATen/core/TensorBody.h>
+// TODO Undo all logic introduced for Note [Avoiding Include Cycles In Static Dispatch]
+// Code introduced to avoid cyclic dependency in static dispatch is no longer
+// needed as static dispatch logic is moved from TensorBody.h, which caused cycles in the first place,
+// to Operators.cpp for supporting multiple backends with multiple kernels.
+//
+// Note [Avoiding Include Cycles In Static Dispatch]
+// In order to avoid #include cycles in the static dispatch build, we've carefully split out
+// the static function definition files into {DispatchKey}Functions.h and {DispatchKey}Functions_inl.h.
+//
+// Without this split, the include cycle looks like TensorBody.h -> CPUFunctions.h -> TensorBody.h.
+// - TensorBody.h #includes CPUFunctions.h in the static dispatch build, because the tensor methods
+//   all need to call into the fastpath C++ API defined in CPUFunctions.h. The methods are also all
+//   directly inlined into TensorBody.h.
+// - CPUFunctions.h #includes TensorBody.h because it contains function declarations for the entire C++ API,
+//   which include functions that have defaultable std::optional<Tensor> arguments.
+//   That requires knowing the full Tensor class definition.
+//
+// We break the cycle by doing the following:
+// - Split out CPUFunctions.h into two files: CPUFunctions.h and CPUFunctions_inl.h
+// - CPUFunctions.h is a dummy file that just includes the Tensor class and includes CPUFunctions_inl.h,
+// - CPUFunctions_inl.h includes everything else
+// - (only in the static dispatch build) TensorBody.h makes sure to finish defining the Tensor class,
+//   and then it includes CPUFunctions_inl.h.
+// - All other files that want the cpu fastpath functions can include CPUFunctions.h directly.
+// - This also means that in the static dispatch build, CPUFunctions.h only needs to
+//   #include TensorBody.h, and it will automatically bring in CPUFunctions_inl.h.
+#include <ATen/CUDAFunctions_inl.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CollapseDims.h ADDED Viewed

	@@ -0,0 +1,99 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/util/Exception.h>
+#include <utility>
+namespace at {
+/*
+[collapse dims] Updates sizes and strides in place to reflect a "collapse" of
+the info, possibly excluding the optional excludeDim. A "collapsed" version
+of the info is the fewest dims that order the tensor's elements in the same
+way as the original info. If excludeDim is specified, the collapse is the
+fewest dims that order the tensor's elements as the original and preserve the
+excluded dimension, unless the tensor collapses to a point.
+
+Size-1 dimensions are dropped, and two neighbouring dimensions are merged when
+the outer stride equals the inner size times the inner stride.
+
+Requirements:
+- excludeDim must lie in [-1, dims - 1]; otherwise TORCH_CHECK fails.
+- sizes and strides must have room for at least one element even when
+  dims == 0, because the "collapsed to a point" case writes sizes[0] and
+  strides[0].
+
+This function returns a pair of values.
+1) The (new) index of the preserved dimension if excludeDim is
+specified. 0 if the tensor is collapsed to a point. -1
+otherwise.
+2) The new number of dimensions (always at least 1).
+*/
+template <typename T>
+inline std::pair<int64_t, int64_t> collapse_dims(
+    T* sizes,
+    T* strides,
+    int64_t dims,
+    const int excludeDim = -1) {
+  TORCH_CHECK(
+      excludeDim >= -1 && excludeDim < dims,
+      "expected excluded dim between -1 and dims - 1");
+  // First pass collapses [0, stopDim); if a dim is excluded, a second pass
+  // collapses the dims after it.
+  int64_t stopDim = (excludeDim == -1) ? dims : excludeDim;
+  int64_t newIndex = -1; // last slot written in the collapsed layout
+  int64_t oldIndex = 0; // next original dim to inspect
+  int64_t remappedExcludedDim = -1;
+  while (oldIndex < dims) {
+    // Finds a dimension to collapse into
+    for (; oldIndex < stopDim; ++oldIndex) {
+      if (sizes[oldIndex] == 1) {
+        continue;
+      }
+      ++newIndex;
+      sizes[newIndex] = sizes[oldIndex];
+      strides[newIndex] = strides[oldIndex];
+      ++oldIndex;
+      break;
+    }
+    // Collapses dims
+    for (; oldIndex < stopDim; ++oldIndex) {
+      if (sizes[oldIndex] == 1) {
+        continue;
+      }
+      if (strides[newIndex] == sizes[oldIndex] * strides[oldIndex]) {
+        // The two dims are contiguous with each other: merge them.
+        sizes[newIndex] *= sizes[oldIndex];
+        strides[newIndex] = strides[oldIndex];
+      } else {
+        ++newIndex;
+        sizes[newIndex] = sizes[oldIndex];
+        strides[newIndex] = strides[oldIndex];
+      }
+    }
+    // Handles excludeDim being set (oldIndex == excludeDim)
+    if (oldIndex != dims) {
+      // Preserves excluded dimension
+      ++newIndex;
+      sizes[newIndex] = sizes[oldIndex];
+      strides[newIndex] = strides[oldIndex];
+      remappedExcludedDim = newIndex;
+      // Restarts iteration after excludeDim
+      ++oldIndex;
+      stopDim = dims;
+    }
+  }
+  // Handles special case of all dims size 1
+  if (newIndex == -1 || (newIndex == 0 && sizes[0] == 1)) {
+    sizes[0] = 1;
+    strides[0] = 1;
+    return std::pair<int64_t, int64_t>(0, 1);
+  }
+  dims = newIndex + 1;
+  return std::pair<int64_t, int64_t>(remappedExcludedDim, dims);
+}
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/CompositeImplicitAutogradNestedTensorFunctions_inl.h ADDED Viewed

	@@ -0,0 +1,30 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from                                  \
+  <ATen/ops/{my_operator}_compositeimplicitautogradnestedtensor_dispatch.h>.                   \
+  See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+#include <ATen/ops/randn_like_compositeimplicitautogradnestedtensor_dispatch.h>
+#include <ATen/ops/reshape_compositeimplicitautogradnestedtensor_dispatch.h>
+#include <ATen/ops/reshape_as_compositeimplicitautogradnestedtensor_dispatch.h>
+#include <ATen/ops/zeros_like_compositeimplicitautogradnestedtensor_dispatch.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Config.h ADDED Viewed

	@@ -0,0 +1,28 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// Test these using #if AT_MKL_ENABLED(), not #ifdef, so that it's
+// obvious if you forgot to include Config.h
+//    c.f. https://stackoverflow.com/questions/33759787/generating-an-error-if-checked-boolean-macro-is-not-defined
+//
+// DO NOT put the macros for CUDA libraries in this file; they belong in cuda/CUDAConfig.h
+#define AT_MKLDNN_ENABLED() 1
+#define AT_MKLDNN_ACL_ENABLED() 0
+#define AT_MKL_ENABLED() 1
+#define AT_MKL_SEQUENTIAL() 0
+#define AT_POCKETFFT_ENABLED() 0
+#define AT_NNPACK_ENABLED() 1
+#define CAFFE2_STATIC_LINK_CUDA() 0
+#define AT_BUILD_WITH_BLAS() 1
+#define AT_BUILD_WITH_LAPACK() 1
+#define AT_PARALLEL_OPENMP 1
+#define AT_PARALLEL_NATIVE 0
+#define AT_BLAS_F2C() 0
+#define AT_BLAS_USE_CBLAS_DOT() 0
+#define AT_KLEIDIAI_ENABLED() 0
+#define AT_USE_EIGEN_SPARSE() 0
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Device.h ADDED Viewed

	@@ -0,0 +1,7 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/Device.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/DeviceAccelerator.h ADDED Viewed

	@@ -0,0 +1,118 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/CachingDeviceAllocator.h>
+#include <c10/core/DeviceCapability.h>
+#include <c10/core/DeviceType.h>
+#include <c10/macros/Macros.h>
+#include <ATen/detail/MTIAHooksInterface.h>
+#include <optional>
+namespace at::accelerator {
+// Note [Accelerator Concept]
+// This file defines the top level Accelerator concept for PyTorch.
+// A device is an accelerator per the definition here if:
+// - It is mutually exclusive with all other accelerators
+// - It performs asynchronous compute via a Stream/Event system
+// - It provides a set of common APIs as defined by AcceleratorHooksInterface
+//
+// As of today, accelerator devices are (in no particular order):
+// CUDA, MTIA, XPU, HIP, MPS, PrivateUse1
+// Ensures that only one accelerator is available (at
+// compile time if possible) and return it.
+// When checked is true, the returned optional always has a value.
+TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);
+// Check if the given device type is an accelerator.
+TORCH_API bool isAccelerator(c10::DeviceType device_type);
+// Check if the given device type is an accelerator and is not one of the excluded device types.
+template <
+    typename... T,
+    typename = std::enable_if_t<(std::is_same_v<T, c10::DeviceType> && ...)>>
+inline bool isAcceleratorExcluded(
+    c10::DeviceType device_type,
+    c10::DeviceType first_excluded,
+    T... rest_excluded) {
+  // A device type that matches any excluded entry is rejected without
+  // consulting isAccelerator; comparisons short-circuit left to right.
+  const bool is_excluded = device_type == first_excluded ||
+      ((device_type == rest_excluded) || ...);
+  return !is_excluded && isAccelerator(device_type);
+}
+// Return the number of devices available. Note that this is *REQUIRED* to
+// not raise any exception.
+TORCH_API c10::DeviceIndex deviceCount();
+// Set the current device index to the given device index.
+TORCH_API void setDeviceIndex(c10::DeviceIndex device_index);
+// Get the current device index.
+TORCH_API c10::DeviceIndex getDeviceIndex();
+// Set the current stream to a given stream. Note that this API doesn't change
+// the current device index.
+TORCH_API void setCurrentStream(c10::Stream stream);
+// Get the current stream of the given device index.
+TORCH_API c10::Stream getCurrentStream(c10::DeviceIndex device_index);
+// Wait (by blocking the calling thread) until all the work previously enqueued
+// on the given device index has been completed.
+TORCH_API void synchronizeDevice(c10::DeviceIndex device_index);
+// Set the current device index to the given device_index and return the
+// original device index that was active before the change.
+TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
+// Set the current device index to the given device_index. Avoid creating a new
+// context if the context for device_index is not initialized. Return the
+// original device index that was active before the change.
+TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);
+// Get the device capability of the given device index.
+TORCH_API c10::DeviceCapability getDeviceCapability(
+    c10::DeviceIndex device_index);
+// Calls emptyCache() on the device allocator of the current accelerator.
+// getAccelerator is called in checked mode, so its result always has a value.
+TORCH_API inline void emptyCache() {
+  at::getDeviceAllocator(getAccelerator(true).value())->emptyCache();
+}
+// Returns getDeviceStats(device_index) from the device allocator of the
+// current accelerator (looked up in checked mode).
+TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
+    c10::DeviceIndex device_index) {
+  return at::getDeviceAllocator(getAccelerator(true).value())
+      ->getDeviceStats(device_index);
+}
+// Forward to resetAccumulatedStats(device_index) on the current accelerator's
+// device allocator.
+TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
+  at::getDeviceAllocator(getAccelerator(true).value())
+      ->resetAccumulatedStats(device_index);
+}
+// Forward to resetPeakStats(device_index) on the current accelerator's device
+// allocator.
+TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
+  at::getDeviceAllocator(getAccelerator(true).value())
+      ->resetPeakStats(device_index);
+}
+// Return the pair that the current accelerator's device allocator reports
+// from getMemoryInfo(device_index). NOTE(review): presumably (free, total)
+// bytes — confirm against the allocator interface.
+TORCH_API inline std::pair<size_t, size_t> getMemoryInfo(
+    c10::DeviceIndex device_index) {
+  return at::getDeviceAllocator(getAccelerator(true).value())
+      ->getMemoryInfo(device_index);
+}
+} // namespace at::accelerator
+namespace at {
+// Backward compatibility only: re-export these two accelerator helpers so
+// they remain reachable as at::getAccelerator and at::isAccelerator.
+using at::accelerator::getAccelerator,
+    at::accelerator::isAccelerator;
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/DimVector.h ADDED Viewed

	@@ -0,0 +1,7 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/DimVector.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Dispatch_v2.h ADDED Viewed

	@@ -0,0 +1,182 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <torch/headeronly/core/Dispatch_v2.h>
+// Get AT_DISPATCH_SWITCH and AT_DISPATCH_CASE:
+#include <ATen/Dispatch.h>
+// This is a new implementation of the AT_DISPATCH macro family from
+// ATen/Dispatch.h
+//
+// The intended usage is:
+//
+//  ScalarType scalar_type;
+//
+//  AT_DISPATCH_V2(
+//    scalar_type,
+//    "debug string",
+//    AT_WRAP([&] {
+//      ... code to specialize with scalar_t ...
+//    }),
+//    kHalf,
+//    AT_EXPAND(AT_ALL_TYPES),
+//    ... as many types arguments as needed ...
+//  )
+//
+// For example, given an old style:
+//
+//  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
+//    kComplexHalf,
+//    kHalf,
+//    self.scalar_type(),
+//    "_local_scalar_dense_cpu",
+//    [&] {
+//      scalar_t value = *self.data_ptr<scalar_t>();
+//      r = Scalar(value);
+//    }
+//  )
+//
+// You now write:
+//
+//  AT_DISPATCH_V2(
+//    self.scalar_type(),
+//    "_local_scalar_dense_cpu",
+//    AT_WRAP([&] {
+//      scalar_t value = *self.data_ptr<scalar_t>();
+//      r = Scalar(value);
+//    }),
+//    AT_EXPAND(AT_ALL_TYPES),
+//    AT_EXPAND(AT_COMPLEX_TYPES),
+//    kComplexHalf,
+//    kHalf,
+//  )
+//
+// Notably, it sports the following improvements:
+//
+//  - It is not necessary to specify the arity (e.g.,
+//    AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND{2,3,4,...})
+//    when using the macro
+//
+//  - It is not necessary to specify each dtype individually; if
+//    there is a set of related dtypes and you want to dispatch
+//    over all of them, you can simply say, e.g., AT_EXPAND(AT_INTEGRAL_TYPES)
+//    in your argument list.
+//
+// However, you must remember to wrap the payload body in AT_WRAP, or commas
+// inside your lambda will be improperly handled.  Furthermore, if you add more
+// entries to ScalarType than can be supported by this macro, it will fail
+// with an obscure error (due to attempting to concatenate AT_AP with
+// something that is not a number).
+//
+// The implementation strategy is to use the count arguments trick
+// (e.g., as described in https://stackoverflow.com/a/2124385/23845)
+// to discover how many dtypes have been passed, and then dispatch to a
+// hand-written macro for each arity that applies as many DISPATCH_CASE as
+// necessary.  The hand-written macros can be regenerated for other arities
+// with the script below.
+//
+// There is some delicacy in the implementation in controlling when
+// macro expansion occurs, mediated with AT_EXPAND and AT_GUARD.  I mostly
+// relied on GPT4 to help me get it right.
+// Dtype dispatch entry point; see the usage documentation above. Forwards to
+// THO_DISPATCH_V2_TMPL, plugging in ATen's AT_DISPATCH_SWITCH and
+// AT_DISPATCH_CASE and re-wrapping BODY in AT_WRAP.
+#define AT_DISPATCH_V2(TYPE, NAME, BODY, ...)                            \
+  THO_DISPATCH_V2_TMPL(                                                  \
+      AT_DISPATCH_SWITCH, AT_DISPATCH_CASE, TYPE, NAME, AT_WRAP(BODY),   \
+      __VA_ARGS__)
+// Unused helper macro, kept for BC. It joins AT_AP with
+// AT_NUM_ARGS(__VA_ARGS__) (presumably token pasting, yielding AT_AP<n>) and
+// invokes the result with N and the variadic arguments; the T parameter is
+// not referenced in the expansion.
+#define AT_AP_VAR(N, T, ...)                          \
+  AT_EXPAND(                                          \
+      AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(     \
+          AT_WRAP(N), __VA_ARGS__))
+// Ensure we never have too many scalar types for the expansion here to
+// support.  To bump this, you must regenerate the macros below.
+// The message makes a failure point directly at the required action.
+static_assert(
+    static_cast<int>(c10::ScalarType::NumOptions) < 60,
+    "Too many ScalarType entries for the generated AT_AP<n> macros; "
+    "regenerate them with a larger num_args and raise this limit.");
+// Python code to regenerate the generated code below:
+#if 0
+num_args = 60
+for i in range(1, num_args+1):
+    args = ', '.join(f'_{i}' for i in range(1, i+1))
+    cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)])
+    print(f'#define AT_AP{i}(N, {args}) {cases}')
+#endif
+// Begin generated code: AT_AP<n>(N, _1, ..., _n) expands to AT_DISPATCH_CASE(_k, N) for k = 1..n
+// clang-format off
+#define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N)
+#define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N)
+#define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N)
+#define AT_AP4(N, _1, _2, _3, _4) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N)
+#define AT_AP5(N, _1, _2, _3, _4, _5) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N)
+#define AT_AP6(N, _1, _2, _3, _4, _5, _6) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N)
+#define AT_AP7(N, _1, _2, _3, _4, _5, _6, _7) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N)
+#define AT_AP8(N, _1, _2, _3, _4, _5, _6, _7, _8) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N)
+#define AT_AP9(N, _1, _2, _3, _4, _5, _6, _7, _8, _9) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N)
+#define AT_AP10(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N)
+#define AT_AP11(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N)
+#define AT_AP12(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N)
+#define AT_AP13(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N)
+#define AT_AP14(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N)
+#define AT_AP15(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N)
+#define AT_AP16(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N)
+#define AT_AP17(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N)
+#define AT_AP18(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N)
+#define AT_AP19(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N)
+#define AT_AP20(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N)
+#define AT_AP21(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N)
+#define AT_AP22(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N)
+#define AT_AP23(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N)
+#define AT_AP24(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N)
+#define AT_AP25(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N)
+#define AT_AP26(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N)
+#define AT_AP27(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N)
+#define AT_AP28(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N)
+#define AT_AP29(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N)
+#define AT_AP30(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N)
+#define AT_AP31(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N)
+#define AT_AP32(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N)
+#define AT_AP33(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N)
+#define AT_AP34(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N)
+#define AT_AP35(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N)
+#define AT_AP36(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N)
+#define AT_AP37(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N)
+#define AT_AP38(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N)
+#define AT_AP39(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N)
+#define AT_AP40(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N)
+#define AT_AP41(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N)
+#define AT_AP42(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N)
+#define AT_AP43(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N)
+#define AT_AP44(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N)
+#define AT_AP45(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N)
+#define AT_AP46(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N)
+#define AT_AP47(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N)
+#define AT_AP48(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N)
+#define AT_AP49(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N)
+#define AT_AP50(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N)
+#define AT_AP51(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N) AT_DISPATCH_CASE(_51, N)
+#define AT_AP52(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N) AT_DISPATCH_CASE(_51, N) AT_DISPATCH_CASE(_52, N)
+#define AT_AP53(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N) AT_DISPATCH_CASE(_51, N) AT_DISPATCH_CASE(_52, N) AT_DISPATCH_CASE(_53, N)
+#define AT_AP54(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N) AT_DISPATCH_CASE(_51, N) AT_DISPATCH_CASE(_52, N) AT_DISPATCH_CASE(_53, N) AT_DISPATCH_CASE(_54, N)
+#define AT_AP55(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N) AT_DISPATCH_CASE(_51, N) AT_DISPATCH_CASE(_52, N) AT_DISPATCH_CASE(_53, N) AT_DISPATCH_CASE(_54, N) AT_DISPATCH_CASE(_55, N)
+#define AT_AP56(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N) AT_DISPATCH_CASE(_51, N) AT_DISPATCH_CASE(_52, N) AT_DISPATCH_CASE(_53, N) AT_DISPATCH_CASE(_54, N) AT_DISPATCH_CASE(_55, N) AT_DISPATCH_CASE(_56, N)
+#define AT_AP57(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N) AT_DISPATCH_CASE(_51, N) AT_DISPATCH_CASE(_52, N) AT_DISPATCH_CASE(_53, N) AT_DISPATCH_CASE(_54, N) AT_DISPATCH_CASE(_55, N) AT_DISPATCH_CASE(_56, N) AT_DISPATCH_CASE(_57, N)
+#define AT_AP58(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N) AT_DISPATCH_CASE(_51, N) AT_DISPATCH_CASE(_52, N) AT_DISPATCH_CASE(_53, N) AT_DISPATCH_CASE(_54, N) AT_DISPATCH_CASE(_55, N) AT_DISPATCH_CASE(_56, N) AT_DISPATCH_CASE(_57, N) AT_DISPATCH_CASE(_58, N)
+#define AT_AP59(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N) AT_DISPATCH_CASE(_51, N) AT_DISPATCH_CASE(_52, N) AT_DISPATCH_CASE(_53, N) AT_DISPATCH_CASE(_54, N) AT_DISPATCH_CASE(_55, N) AT_DISPATCH_CASE(_56, N) AT_DISPATCH_CASE(_57, N) AT_DISPATCH_CASE(_58, N) AT_DISPATCH_CASE(_59, N)
+#define AT_AP60(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) AT_DISPATCH_CASE(_46, N) AT_DISPATCH_CASE(_47, N) AT_DISPATCH_CASE(_48, N) AT_DISPATCH_CASE(_49, N) AT_DISPATCH_CASE(_50, N) AT_DISPATCH_CASE(_51, N) AT_DISPATCH_CASE(_52, N) AT_DISPATCH_CASE(_53, N) AT_DISPATCH_CASE(_54, N) AT_DISPATCH_CASE(_55, N) AT_DISPATCH_CASE(_56, N) AT_DISPATCH_CASE(_57, N) AT_DISPATCH_CASE(_58, N) AT_DISPATCH_CASE(_59, N) AT_DISPATCH_CASE(_60, N)
+// End generated code
+// clang-format on
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/DynamicLibrary.h ADDED Viewed

	@@ -0,0 +1,41 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/Utils.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Exception.h>
+namespace c10 {
+class DynamicLibraryError : public Error { // Distinct exception type derived from Error; presumably what at::DynamicLibrary reports on load/lookup failure -- TODO confirm in the implementation file.
+  using Error::Error; // Inherit every Error constructor; this class declares no members or constructors of its own.
+};
+} // namespace c10
+namespace at {
+struct DynamicLibrary { // Holder of one opaque void* library handle; all member functions are declared here and defined out of line.
+  AT_DISALLOW_COPY_AND_ASSIGN(DynamicLibrary); // Macro defined elsewhere; by its name it removes copy construction/assignment -- verify against its definition.
+  DynamicLibrary(DynamicLibrary&& other) = delete; // Move construction is explicitly deleted.
+  DynamicLibrary& operator=(DynamicLibrary&&) = delete; // Move assignment is explicitly deleted.
+  TORCH_API DynamicLibrary( // Constructor; how the names are resolved is not visible in this header.
+      const char* name, // Required library name.
+      const char* alt_name = nullptr, // Optional; presumably a fallback tried when `name` cannot be opened -- TODO confirm.
+      bool leak_handle = false); // Optional, defaults to false; presumably true keeps the library loaded past destruction -- TODO confirm.
+  TORCH_API void* sym(const char* name); // Returns an untyped pointer for the named symbol; failure behavior (throw vs. nullptr) is not visible here.
+  TORCH_API ~DynamicLibrary(); // User-declared destructor, defined out of line.
+ private:
+  bool leak_handle; // NOTE(review): no in-class initializer, so the out-of-line constructor is expected to assign it.
+  void* handle = nullptr; // Opaque handle; null until something stores a real value.
+};
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/EmptyTensor.h ADDED Viewed

	@@ -0,0 +1,171 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/TensorBase.h>
+namespace at::detail {
+inline void check_size_nonnegative(ArrayRef<int64_t> size) { // Asserts that every entry of `size` is >= 0; an empty `size` passes because the loop body never runs.
+  for (const auto& x : size) { // Entries are checked in order, so the first negative one is the one reported.
+    TORCH_CHECK( // Macro defined elsewhere: first argument is the condition, the rest are message pieces.
+        x >= 0,
+        "Trying to create tensor with negative dimension ",
+        x, // The offending entry is included in the message...
+        ": ",
+        size); // ...followed by the complete size list for context.
+  }
+}
+inline void check_size_nonnegative(ArrayRef<c10::SymInt> size) { // SymInt overload of the check above: same message, symbolic comparison.
+  for (const auto& x : size) { // Visits every entry in order; an empty `size` performs no checks.
+    TORCH_SYM_CHECK( // Symbolic counterpart of TORCH_CHECK (defined elsewhere); takes the result of sym_ge as its condition.
+        x.sym_ge(0), // Comparison is built with SymInt::sym_ge, not a plain `>=` on an integer.
+        "Trying to create tensor with negative dimension ",
+        x, // The offending entry is included in the message...
+        ": ",
+        size); // ...followed by the complete size list for context.
+  }
+}
+TORCH_API size_t computeStorageNbytesContiguous( // Concrete-integer overload; declaration only, the definition is not in this header.
+    IntArrayRef sizes,
+    size_t itemsize, // presumably bytes per element -- TODO confirm against the definition
+    size_t storage_offset = 0); // optional, defaults to 0
+TORCH_API SymInt computeStorageNbytesContiguous( // Symbolic overload: same parameter list, with SymInt types in and out.
+    SymIntArrayRef sizes,
+    const SymInt& itemsize,
+    const SymInt& storage_offset = 0); // optional, defaults to 0
+TORCH_API size_t computeStorageNbytes( // Same parameters as the contiguous form plus an explicit `strides` list.
+    IntArrayRef sizes,
+    IntArrayRef strides,
+    size_t itemsize,
+    size_t storage_offset = 0); // optional, defaults to 0
+TORCH_API SymInt computeStorageNbytes( // Symbolic overload of the strided form.
+    SymIntArrayRef sizes,
+    SymIntArrayRef strides,
+    const SymInt& itemsize,
+    const SymInt& storage_offset = 0); // optional, defaults to 0
+TORCH_API TensorBase empty_generic( // Declaration only. The caller passes allocator, dispatch key set and scalar type explicitly.
+    IntArrayRef size,
+    c10::Allocator* allocator, // raw pointer; ownership and nullability are not specified in this header
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type,
+    std::optional<c10::MemoryFormat> memory_format_opt); // no default argument: pass std::nullopt explicitly when unset
+TORCH_API TensorBase empty_generic_symint( // SymIntArrayRef counterpart of empty_generic; remaining parameters are identical.
+    SymIntArrayRef size,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type,
+    std::optional<c10::MemoryFormat> memory_format_opt); // no default argument here either
+TORCH_API TensorBase empty_strided_generic( // Strided form: takes `stride` and has no memory-format parameter.
+    IntArrayRef size,
+    IntArrayRef stride,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type);
+TORCH_API TensorBase empty_strided_symint_generic( // SymIntArrayRef counterpart. NOTE(review): suffix order is "symint_generic" here but "generic_symint" for the non-strided function.
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    c10::Allocator* allocator,
+    c10::DispatchKeySet ks,
+    ScalarType scalar_type);
+TORCH_API TensorBase empty_cpu( // Overload 1 of 3: explicit dtype; pin_memory and memory format are optional trailing arguments.
+    IntArrayRef size,
+    ScalarType dtype,
+    bool pin_memory = false, // defaults to false
+    std::optional<c10::MemoryFormat> memory_format_opt = std::nullopt); // defaults to std::nullopt
+TORCH_API TensorBase empty_cpu( // Overload 2 of 3: each option is a separate std::optional and none has a default.
+    IntArrayRef size,
+    std::optional<ScalarType> dtype_opt,
+    std::optional<Layout> layout_opt,
+    std::optional<Device> device_opt,
+    std::optional<bool> pin_memory_opt,
+    std::optional<c10::MemoryFormat> memory_format_opt);
+TORCH_API TensorBase empty_cpu(IntArrayRef size, const TensorOptions& options); // Overload 3 of 3: options bundled in a TensorOptions.
+TORCH_API TensorBase empty_strided_cpu( // Strided overload 1 of 3: explicit dtype; no memory-format parameter.
+    IntArrayRef size,
+    IntArrayRef stride,
+    ScalarType dtype,
+    bool pin_memory = false); // defaults to false
+TORCH_API TensorBase empty_strided_cpu( // Strided overload 2 of 3: separate std::optional per option, none defaulted.
+    IntArrayRef size,
+    IntArrayRef stride,
+    std::optional<ScalarType> dtype_opt,
+    std::optional<Layout> layout_opt,
+    std::optional<Device> device_opt,
+    std::optional<bool> pin_memory_opt);
+TORCH_API TensorBase empty_strided_cpu( // Strided overload 3 of 3: options bundled in a TensorOptions.
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions& options);
+// "meta" factory declarations, mirroring the CPU overload set above but
+// without a pin_memory flag on the explicit-dtype forms. Definitions are not
+// part of this header.
+//
+// Explicit dtype; memory format defaults to nullopt.
+TORCH_API TensorBase empty_meta(
+    IntArrayRef size,
+    ScalarType dtype,
+    std::optional<c10::MemoryFormat> memory_format_opt = std::nullopt);
+// Unpacked optional arguments.
+TORCH_API TensorBase empty_meta(
+    IntArrayRef size,
+    std::optional<ScalarType> dtype_opt,
+    std::optional<Layout> layout_opt,
+    std::optional<Device> device_opt,
+    std::optional<bool> pin_memory_opt,
+    std::optional<c10::MemoryFormat> memory_format_opt);
+// Symbolic-size counterpart of the unpacked-options overload.
+TORCH_API TensorBase empty_symint_meta(
+    SymIntArrayRef size,
+    std::optional<ScalarType> dtype_opt,
+    std::optional<Layout> layout_opt,
+    std::optional<Device> device_opt,
+    std::optional<bool> pin_memory_opt,
+    std::optional<c10::MemoryFormat> memory_format_opt);
+// Packed TensorOptions.
+TORCH_API TensorBase empty_meta(IntArrayRef size, const TensorOptions& options);
+// Strided variant, explicit dtype.
+TORCH_API TensorBase
+empty_strided_meta(IntArrayRef size, IntArrayRef stride, ScalarType dtype);
+// Strided variant, unpacked optional arguments.
+TORCH_API TensorBase empty_strided_meta(
+    IntArrayRef size,
+    IntArrayRef stride,
+    std::optional<ScalarType> dtype_opt,
+    std::optional<Layout> layout_opt,
+    std::optional<Device> device_opt,
+    std::optional<bool> pin_memory_opt);
+// Strided variant, packed TensorOptions.
+TORCH_API TensorBase empty_strided_meta(
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions& options);
+// Symbolic strided variant, explicit dtype.
+TORCH_API TensorBase empty_strided_symint_meta(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    ScalarType dtype);
+// Symbolic strided variant, unpacked optional arguments.
+// NOTE(review): unlike the IntArrayRef overload of empty_strided_meta, this
+// one takes no pin_memory_opt parameter -- confirm this asymmetry is intended.
+TORCH_API TensorBase empty_strided_symint_meta(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    std::optional<ScalarType> dtype_opt,
+    std::optional<Layout> layout_opt,
+    std::optional<Device> device_opt);
+// Symbolic strided variant, packed TensorOptions.
+TORCH_API TensorBase empty_strided_symint_meta(
+    SymIntArrayRef size,
+    SymIntArrayRef stride,
+    const TensorOptions& options);
+} // namespace at::detail
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ExpandUtils.h ADDED Viewed

	@@ -0,0 +1,540 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/view.h>
+#include <ATen/ops/view_copy.h>
+#endif
+#include <ATen/Tensor.h>
+#include <ATen/core/DimVector.h>
+#include <c10/util/Exception.h>
+#include <c10/util/MaybeOwned.h>
+#include <c10/util/irange.h>
+#include <functional>
+#include <tuple>
+#include <utility>
+namespace at {
+// Shape-inference declarations for a pair of shapes `a` and `b`. The four
+// functions differ only in element type (int64_t vs SymInt) and in the
+// container they return (std::vector vs DimVector/SymDimVector).
+// Presumably these compute the broadcast shape of `a` and `b` -- the
+// definitions are not part of this header.
+TORCH_API std::vector<int64_t> infer_size(IntArrayRef a, IntArrayRef b);
+TORCH_API std::vector<SymInt> infer_size_symint(
+    SymIntArrayRef a,
+    SymIntArrayRef b);
+TORCH_API DimVector infer_size_dimvector(IntArrayRef a, IntArrayRef b);
+TORCH_API SymDimVector
+infer_size_symdimvector(SymIntArrayRef a, SymIntArrayRef b);
+// Named type instead of a pair/tuple so that we can be sure to
+// construct the vectors in place and get NRVO.
+//
+// Holds a `sizes` and a `strides` container of the same `Container` type.
+template <typename Container>
+struct InferExpandGeometryResult {
+  Container sizes;
+  Container strides;
+  // Constructs both containers with `ndim` elements each.
+  explicit InferExpandGeometryResult(size_t ndim)
+      : sizes(ndim), strides(ndim) {}
+  // Copies `sizes_` into `sizes` and constructs `strides` with `ndim`
+  // elements. Note that `ndim` is not checked against `sizes_.size()`.
+  explicit InferExpandGeometryResult(IntArrayRef sizes_, size_t ndim)
+      : sizes(sizes_.begin(), sizes_.end()), strides(ndim) {}
+};
+// Expand-geometry declarations: given a tensor's current sizes/strides and
+// the requested `sizes`, return a (sizes, strides) pair. The first overload
+// returns a tuple of std::vector; the `_dimvector` overload returns an
+// InferExpandGeometryResult<DimVector>. Definitions are not part of this
+// header.
+TORCH_API std::tuple<std::vector<int64_t>, std::vector<int64_t>>
+inferExpandGeometry(
+    IntArrayRef tensor_sizes,
+    IntArrayRef tensor_strides,
+    IntArrayRef sizes);
+TORCH_API InferExpandGeometryResult<DimVector> inferExpandGeometry_dimvector(
+    IntArrayRef tensor_sizes,
+    IntArrayRef tensor_strides,
+    IntArrayRef sizes);
+// Returns a stride vector derived from the given sizes and strides.
+// Presumably the strides of a dense tensor with the same layout order --
+// confirm against the definition.
+TORCH_API std::vector<int64_t> infer_dense_strides(
+    IntArrayRef tensor_sizes,
+    IntArrayRef tensor_strides);
+// Returns true when `shape1` and `shape2` can be expanded against each other.
+// The shapes are compared from their trailing dimension backwards over the
+// dimensions they have in common; a pair of extents is compatible when the
+// two are equal or either one is 1.
+// NOTE: infer_size performs a similar check; keep the two in sync if either
+// one changes.
+inline bool are_expandable(IntArrayRef shape1, IntArrayRef shape2) {
+  const size_t rank1 = shape1.size();
+  const size_t rank2 = shape2.size();
+  const size_t shared_rank = rank1 < rank2 ? rank1 : rank2;
+  // `back` counts positions from the end: 1 is the last dimension.
+  for (size_t back = 1; back <= shared_rank; ++back) {
+    const auto extent1 = shape1[rank1 - back];
+    const auto extent2 = shape2[rank2 - back];
+    const bool compatible =
+        extent1 == extent2 || extent1 == 1 || extent2 == 1;
+    if (!compatible) {
+      return false;
+    }
+  }
+  return true;
+}
+// avoid copy-construction of Tensor by using a reference_wrapper.
+inline void check_defined(
+    std::initializer_list<std::reference_wrapper<const Tensor>> tensors,
+    const char* api_name) {
+  for (auto& t : tensors) {
+    if (!t.get().defined()) {
+      TORCH_CHECK(false, api_name, "(...) called with an undefined Tensor");
+    }
+  }
+}
+// NOTE [ ExpandUtils Borrowing ]
+//
+// Functions in ExpandUtils return `c10::MaybeOwned<Tensor>` because
+// expansion may not actually be needed, in which case we can improve
+// efficiency by returning
+// `c10::MaybeOwned<Tensor>::borrowed(to_expand)`. However, this means
+// that you need to be careful: the returned `c10::MaybeOwned<Tensor>`
+// must not outlive the original `Tensor` object that `to_expand`
+// referred to! The deleted rvalue reference overloads of these
+// functions help with this by preventing trivial use of a temporary
+// resulting from a function call, but it is still possible to make a
+// mistake.
+// Expands `to_expand` to the (symbolic) shape of `tensor`. When the shapes
+// already match, the result borrows `to_expand` instead of creating a new
+// Tensor, so it must not outlive `to_expand`.
+inline c10::MaybeOwned<Tensor> expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand) {
+  using Result = c10::MaybeOwned<Tensor>;
+  if (!tensor.sym_sizes().equals(to_expand.sym_sizes())) {
+    return Result::owned(to_expand.expand_symint(tensor.sym_sizes()));
+  }
+  return Result::borrowed(to_expand);
+}
+// Deleted: `to_expand` must not be a temporary, because the returned
+// MaybeOwned may borrow from it (see NOTE [ ExpandUtils Borrowing ]).
+inline c10::MaybeOwned<Tensor> expand_inplace(
+    const Tensor& tensor,
+    Tensor&& to_expand) = delete;
+// Checked variant of expand_inplace: first verifies that both tensors are
+// defined (reporting failures under `api_name`), then forwards to the
+// two-argument overload.
+inline c10::MaybeOwned<Tensor> expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand,
+    const char* api_name) {
+  check_defined({std::cref(tensor), std::cref(to_expand)}, api_name);
+  c10::MaybeOwned<Tensor> expanded = expand_inplace(tensor, to_expand);
+  return expanded;
+}
+// Deleted: `to_expand` must not be a temporary, because the returned
+// MaybeOwned may borrow from it (see NOTE [ ExpandUtils Borrowing ]).
+inline c10::MaybeOwned<Tensor> expand_inplace(
+    const Tensor& tensor,
+    Tensor&& to_expand,
+    const char* api_name) = delete;
+// Expands `to_expand1` and `to_expand2` to the shape of `tensor`.
+//
+// If both inputs already have that shape, the returned values borrow the
+// inputs (no new Tensor is created) and therefore must not outlive them;
+// otherwise both returned values own freshly expanded tensors.
+//
+// Shapes are compared through the symbolic-size API, matching the
+// single-tensor expand_inplace overload, so that tensors with symbolic
+// shapes are handled as well. For concrete shapes the result is the same as
+// comparing `sizes()` and calling `expand()`.
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand1,
+    const Tensor& to_expand2) {
+  // View into `tensor`'s sizes; valid for the duration of this call.
+  const auto target_sizes = tensor.sym_sizes();
+  if (target_sizes.equals(to_expand1.sym_sizes()) &&
+      target_sizes.equals(to_expand2.sym_sizes())) {
+    return std::make_tuple(
+        c10::MaybeOwned<Tensor>::borrowed(to_expand1),
+        c10::MaybeOwned<Tensor>::borrowed(to_expand2));
+  }
+  return std::make_tuple(
+      c10::MaybeOwned<Tensor>::owned(to_expand1.expand_symint(target_sizes)),
+      c10::MaybeOwned<Tensor>::owned(to_expand2.expand_symint(target_sizes)));
+}
+// Deleted: neither `to_expand1` nor `to_expand2` may be a temporary, because
+// the returned MaybeOwned values may borrow from them (see
+// NOTE [ ExpandUtils Borrowing ]). One overload per rvalue combination.
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_inplace(
+    const Tensor& tensor,
+    Tensor&& to_expand1,
+    const Tensor& to_expand2) = delete;
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand1,
+    Tensor&& to_expand2) = delete;
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_inplace(const Tensor& tensor, Tensor&& to_expand1, Tensor&& to_expand2) =
+    delete;
+// Checked variant of the two-input expand_inplace: verifies that all three
+// tensors are defined (reporting failures under `api_name`) before forwarding
+// to the unchecked overload.
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    const char* api_name) {
+  check_defined(
+      {std::cref(tensor), std::cref(to_expand1), std::cref(to_expand2)},
+      api_name);
+  auto expanded = expand_inplace(tensor, to_expand1, to_expand2);
+  return expanded;
+}
+// Deleted: neither `to_expand1` nor `to_expand2` may be a temporary, because
+// the returned MaybeOwned values may borrow from them (see
+// NOTE [ ExpandUtils Borrowing ]). One overload per rvalue combination.
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_inplace(
+    const Tensor& tensor,
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    const char* api_name) = delete;
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_inplace(
+    const Tensor& tensor,
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    const char* api_name) = delete;
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_inplace(
+    const Tensor& tensor,
+    Tensor&& to_expand1,
+    Tensor&& to_expand2,
+    const char* api_name) = delete;
+// Expands two tensors to their common inferred (symbolic) shape.
+// When the shapes already match, both results borrow their inputs; see
+// NOTE [ ExpandUtils Borrowing ] above for the lifetime implications.
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_outplace(const Tensor& to_expand1, const Tensor& to_expand2) {
+  using MaybeOwnedTensor = c10::MaybeOwned<Tensor>;
+  auto first_shape = to_expand1.sym_sizes();
+  auto second_shape = to_expand2.sym_sizes();
+  if (!first_shape.equals(second_shape)) {
+    auto common_shape = infer_size_symdimvector(first_shape, second_shape);
+    return std::make_tuple(
+        MaybeOwnedTensor::owned(to_expand1.expand_symint(common_shape)),
+        MaybeOwnedTensor::owned(to_expand2.expand_symint(common_shape)));
+  }
+  return std::make_tuple(
+      MaybeOwnedTensor::borrowed(to_expand1),
+      MaybeOwnedTensor::borrowed(to_expand2));
+}
+// Deleted: inputs must not be temporaries, because the returned MaybeOwned
+// values may borrow from them (see NOTE [ ExpandUtils Borrowing ]).
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_outplace(Tensor&& to_expand1, const Tensor& to_expand2) = delete;
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_outplace(const Tensor& to_expand1, Tensor&& to_expand2) = delete;
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_outplace(Tensor&& to_expand1, Tensor&& to_expand2) = delete;
+// Checked variant of the two-tensor expand_outplace: verifies that both
+// tensors are defined (reporting failures under `api_name`) before forwarding
+// to the unchecked overload.
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_outplace(
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    const char* api_name) {
+  check_defined({std::cref(to_expand1), std::cref(to_expand2)}, api_name);
+  auto expanded = expand_outplace(to_expand1, to_expand2);
+  return expanded;
+}
+// Deleted: inputs must not be temporaries, because the returned MaybeOwned
+// values may borrow from them (see NOTE [ ExpandUtils Borrowing ]).
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_outplace(
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    const char* api_name) = delete;
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_outplace(
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    const char* api_name) = delete;
+inline std::tuple<c10::MaybeOwned<Tensor>, c10::MaybeOwned<Tensor>>
+expand_outplace(
+    Tensor&& to_expand1,
+    Tensor&& to_expand2,
+    const char* api_name) = delete;
+// Expands three tensors to their common inferred shape.
+//
+// If all three already share the same shape, the returned values borrow the
+// inputs (no new Tensor is created) and therefore must not outlive them;
+// otherwise each returned value owns a freshly expanded tensor.
+//
+// Shapes are handled through the symbolic-size API, matching the two-tensor
+// and TensorList overloads of expand_outplace, so that tensors with symbolic
+// shapes are supported as well. For concrete shapes the result is the same
+// as using `sizes()`, `infer_size_dimvector` and `expand()`.
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    const Tensor& to_expand3) {
+  // Views into each input's sizes; valid for the duration of this call.
+  const auto s1 = to_expand1.sym_sizes();
+  const auto s2 = to_expand2.sym_sizes();
+  const auto s3 = to_expand3.sym_sizes();
+  if (s1.equals(s2) && s1.equals(s3)) {
+    return std::make_tuple(
+        c10::MaybeOwned<Tensor>::borrowed(to_expand1),
+        c10::MaybeOwned<Tensor>::borrowed(to_expand2),
+        c10::MaybeOwned<Tensor>::borrowed(to_expand3));
+  }
+  // Fold the inference pairwise: (1, 2) first, then the result with 3.
+  auto expanded_size12 = infer_size_symdimvector(s1, s2);
+  auto expanded_size = infer_size_symdimvector(expanded_size12, s3);
+  return std::make_tuple(
+      c10::MaybeOwned<Tensor>::owned(to_expand1.expand_symint(expanded_size)),
+      c10::MaybeOwned<Tensor>::owned(to_expand2.expand_symint(expanded_size)),
+      c10::MaybeOwned<Tensor>::owned(to_expand3.expand_symint(expanded_size)));
+}
+// Deleted: none of the three inputs may be a temporary, because the returned
+// MaybeOwned values may borrow from them (see NOTE [ ExpandUtils Borrowing ]).
+// One overload per combination with at least one rvalue argument.
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    const Tensor& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    const Tensor& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    Tensor&& to_expand1,
+    Tensor&& to_expand2,
+    const Tensor& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    Tensor&& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    Tensor&& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    Tensor&& to_expand3) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(Tensor&& to_expand1, Tensor&& to_expand2, Tensor&& to_expand3) =
+    delete;
+// Checked variant of the three-tensor expand_outplace: verifies that all
+// inputs are defined (reporting failures under `api_name`) before forwarding
+// to the unchecked overload.
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    const Tensor& to_expand3,
+    const char* api_name) {
+  check_defined(
+      {std::cref(to_expand1), std::cref(to_expand2), std::cref(to_expand3)},
+      api_name);
+  auto expanded = expand_outplace(to_expand1, to_expand2, to_expand3);
+  return expanded;
+}
+// Deleted: none of the three inputs may be a temporary, because the returned
+// MaybeOwned values may borrow from them (see NOTE [ ExpandUtils Borrowing ]).
+// One overload per combination with at least one rvalue argument.
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    const Tensor& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    const Tensor& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    Tensor&& to_expand1,
+    Tensor&& to_expand2,
+    const Tensor& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    const Tensor& to_expand1,
+    const Tensor& to_expand2,
+    Tensor&& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    Tensor&& to_expand1,
+    const Tensor& to_expand2,
+    Tensor&& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    const Tensor& to_expand1,
+    Tensor&& to_expand2,
+    Tensor&& to_expand3,
+    const char* api_name) = delete;
+inline std::tuple<
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>,
+    c10::MaybeOwned<Tensor>>
+expand_outplace(
+    Tensor&& to_expand1,
+    Tensor&& to_expand2,
+    Tensor&& to_expand3,
+    const char* api_name) = delete;
+// Expands `to_expand` to `sizes`. When the tensor already has exactly that
+// shape, the result borrows `to_expand` and must not outlive it; otherwise
+// the result owns the expanded tensor.
+inline c10::MaybeOwned<Tensor> expand_size(
+    const Tensor& to_expand,
+    IntArrayRef sizes) {
+  using Result = c10::MaybeOwned<Tensor>;
+  if (!to_expand.sizes().equals(sizes)) {
+    return Result::owned(to_expand.expand(sizes));
+  }
+  return Result::borrowed(to_expand);
+}
+// Deleted: `to_expand` must not be a temporary, because the returned
+// MaybeOwned may borrow from it (see NOTE [ ExpandUtils Borrowing ]).
+inline c10::MaybeOwned<Tensor> expand_size(
+    Tensor&& to_expand,
+    IntArrayRef sizes) = delete;
+// Checked variant of expand_size: verifies that `to_expand` is defined
+// (reporting failures under `api_name`) before forwarding to the unchecked
+// overload.
+inline c10::MaybeOwned<Tensor> expand_size(
+    const Tensor& to_expand,
+    IntArrayRef sizes,
+    const char* api_name) {
+  check_defined({std::cref(to_expand)}, api_name);
+  c10::MaybeOwned<Tensor> expanded = expand_size(to_expand, sizes);
+  return expanded;
+}
+// Deleted: `to_expand` must not be a temporary, because the returned
+// MaybeOwned may borrow from it (see NOTE [ ExpandUtils Borrowing ]).
+inline c10::MaybeOwned<Tensor> expand_size(
+    Tensor&& to_expand,
+    IntArrayRef sizes,
+    const char* api_name) = delete;
+// Expands every defined tensor in `to_expand` to the shape inferred from all
+// defined tensors in the list. Undefined (null) tensors are ignored during
+// inference and stay undefined at the same position in the result.
+inline std::vector<Tensor> expand_outplace(TensorList to_expand) {
+  // Pass 1: fold the shapes of all defined tensors into one common shape.
+  SymDimVector common_shape;
+  bool have_shape = false;
+  for (const Tensor& candidate : to_expand) {
+    if (!candidate.defined()) {
+      continue;
+    }
+    if (have_shape) {
+      common_shape =
+          infer_size_symdimvector(common_shape, candidate.sym_sizes());
+    } else {
+      common_shape = candidate.sym_sizes();
+      have_shape = true;
+    }
+  }
+  // Pass 2: reuse tensors that already match, expand the rest.
+  const size_t count = to_expand.size();
+  std::vector<Tensor> result(count);
+  for (size_t idx = 0; idx < count; ++idx) {
+    const Tensor& source = to_expand[idx];
+    if (!source.defined()) {
+      continue;
+    }
+    if (source.sym_sizes().equals(common_shape)) {
+      result[idx] = source;
+    } else {
+      result[idx] = source.expand_symint(common_shape);
+    }
+  }
+  return result;
+}
+// Shared implementation of sum_to for concrete (int64_t) and symbolic
+// (SymInt) shapes, selected by `T`.
+//
+// Sums `tensor` over the dimensions needed to reduce it to `shape`:
+//   * an empty `shape` yields a full reduction (tensor.sum());
+//   * dimensions of `tensor` that have no counterpart in `shape` (the
+//     leading ones) are always reduced;
+//   * a remaining dimension is reduced when `shape` has 1 there and the
+//     tensor does not.
+// When `always_return_non_view` is true the result is produced with
+// view_copy/clone rather than view/the tensor itself.
+template <typename T>
+inline Tensor _sum_to(
+    Tensor tensor,
+    const c10::ArrayRef<T> shape,
+    bool always_return_non_view = false) {
+  if (shape.size() == 0) {
+    return tensor.sum();
+  }
+  auto sizes = at::symint::sizes<T>(tensor);
+  c10::SmallVector<int64_t, 8> reduce_dims;
+  // Number of extra leading dimensions `tensor` has compared to `shape`.
+  // NOTE(review): this is computed from unsigned sizes; if `shape` has more
+  // dimensions than `tensor` it becomes negative and `sizes[i]` below is
+  // indexed with a negative `i`. Callers presumably guarantee that `shape`
+  // is expandable to the tensor's sizes -- confirm.
+  const int64_t leading_dims = sizes.size() - shape.size();
+  for (const auto i : c10::irange(leading_dims)) {
+    reduce_dims.push_back(i);
+  }
+  for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
+    // Reduce when the target extent is 1 and the source extent is not. The
+    // guards pick a default (false / true respectively) when the comparison
+    // cannot be decided.
+    if (TORCH_GUARD_OR_FALSE(sym_eq(shape[i - leading_dims], 1)) &&
+        TORCH_GUARD_OR_TRUE(sym_ne(sizes[i], 1))) {
+      reduce_dims.push_back(i);
+    } else {
+      // if we assume no reduction due to unbacked we ensure that at runtime.
+      TORCH_MAYBE_SYM_CHECK(
+          sym_eq(shape[i - leading_dims], sizes[i]),
+          "non-reduction path was assumed due to unbacked symbols expected those two sizes to be the same:",
+          shape[i - leading_dims],
+          ", ",
+          sizes[i])
+    }
+  }
+  if (!reduce_dims.empty()) {
+    // keepdim=true so the rank is unchanged until the final view below.
+    tensor = tensor.sum(reduce_dims, /*keepdim=*/true);
+  }
+  if (always_return_non_view) {
+    // This is only actually used by the functionalization pass.
+    // We want to be able to guarantee that this function doesn't return a view
+    // of the input.
+    return leading_dims > 0 ? at::symint::view_copy<T>(tensor, shape)
+                            : tensor.clone();
+  } else {
+    // The leading dimensions were reduced with keepdim, so a view to `shape`
+    // is only needed when there were any.
+    return leading_dims > 0 ? at::symint::view<T>(tensor, shape) : tensor;
+  }
+}
+// Symbolic-shape entry point: sums `tensor` down to `shape` by delegating to
+// _sum_to. `always_return_non_view` is forwarded unchanged.
+inline Tensor sum_to(
+    Tensor tensor,
+    const c10::SymIntArrayRef shape,
+    bool always_return_non_view = false) {
+  Tensor reduced = _sum_to(std::move(tensor), shape, always_return_non_view);
+  return reduced;
+}
+// Concrete-shape entry point: sums `tensor` repeatedly until it has shape
+// `shape`, by delegating to _sum_to.
+// Precondition: is_expandable_to(shape, tensor.sizes()) must be true.
+inline Tensor sum_to(
+    Tensor tensor,
+    const IntArrayRef shape,
+    bool always_return_non_view = false) {
+  Tensor reduced = _sum_to(std::move(tensor), shape, always_return_non_view);
+  return reduced;
+}
+// Returns true when `shape` can be expanded to `desired`: `shape` must not
+// have more dimensions than `desired`, and -- aligning both from the trailing
+// dimension -- every extent of `shape` must either equal the corresponding
+// extent of `desired` or be 1.
+inline bool is_expandable_to(
+    SymIntArrayRef shape,
+    c10::SymIntArrayRef desired) {
+  const size_t source_rank = shape.size();
+  const size_t target_rank = desired.size();
+  if (source_rank > target_rank) {
+    return false;
+  }
+  // `back` counts positions from the end: 1 is the last dimension.
+  for (size_t back = 1; back <= source_rank; ++back) {
+    const auto& extent = shape[source_rank - back];
+    const auto& wanted = desired[target_rank - back];
+    if (extent != wanted && extent != 1) {
+      return false;
+    }
+  }
+  return true;
+}
+// Concrete-integer overload: reinterprets both int64_t arrays as SymInt
+// arrays of the same length (no copy) and defers to the SymInt overload.
+inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
+  const auto* shape_as_sym =
+      reinterpret_cast<const c10::SymInt*>(shape.data());
+  const auto* desired_as_sym =
+      reinterpret_cast<const c10::SymInt*>(desired.data());
+  return is_expandable_to(
+      c10::SymIntArrayRef(shape_as_sym, shape.size()),
+      c10::SymIntArrayRef(desired_as_sym, desired.size()));
+}
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/FunctionalTensorWrapper.h ADDED Viewed

	@@ -0,0 +1,476 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/ArrayRef.h>
+#include <ATen/FunctionalStorageImpl.h>
+#include <ATen/core/IListRef.h>
+#include <ATen/core/List.h>
+#include <ATen/core/boxing/BoxedKernel.h>
+#include <ATen/core/boxing/impl/boxing.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <c10/core/DispatchKey.h>
+namespace at {
+// Note [Functionalization Pass In Core]
+// The Functionalization pass is used to remove aliasing from a pytorch program.
+//
+// This is useful for backends that don't support aliasing, like XLA and Vulkan.
+// It's also necessary in order to remove mutation from a program, which is
+// needed in Functorch.
+//
+// Consider this program:
+// a = torch.ones(...)
+// b = a.view(...)
+// b.add_(1)
+//
+// In this program, b is meant to alias with a due to the use of view(). At the
+// end of the program, both a and b are full of 2's. However, backends that
+// don't support aliasing aren't able to correctly implement the view()
+// operator. Instead, they can opt into the Functionalization pass, which will
+// sit between the user and the backend, and provide the necessary aliasing
+// logic.
+//
+// The functionalization pass will turn the above program into a slightly
+// different program that has the same semantics, transparently to the user,
+// that backends like XLA/Vulkan are able to implement:
+//
+// a = torch.ones(...)
+// b = a.view_copy(...)  # view() replaced with view_copy(). Backends like
+//                       # XLA/Vulkan can implement this!
+// b.add_(1)
+// a.add_(1)  # Our functionalization pass machinery knows that a and b are
+//            # aliased - it applies b's mutation to a too.
+//
+// So, how does the functionalization pass keep track of which tensors are
+// aliased? The pass works by wrapping EVERY tensor in the program inside of a
+// FunctionalTensorWrapper, which knows about its alias'd tensors.
+//
+// See Note [Functionalization: Alias Removal] for details on the aliasing
+// machinery. See Note [Functionalization: Mutation Removal] for details on
+// mutation removal.
+// TensorImpl subclass used by the functionalization pass. It wraps an inner
+// tensor (`value_`) together with the sequence of view operations
+// (`view_metas_`) that produced it, a generation counter used to detect
+// staleness relative to its alias, and a handful of mutation-tracking flags.
+struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
+  explicit FunctionalTensorWrapper(const Tensor& value);
+  // Additional constructor to create a FunctionalTensorWrapper directly from an
+  // underlying tensor that was created from a view. For example, the code b =
+  // a.view1() will generate a constructor call to FunctionalTensorWrapper(b, a,
+  // view1_meta)
+  explicit FunctionalTensorWrapper(
+      const Tensor& view_value,
+      const FunctionalTensorWrapper* base,
+      const std::shared_ptr<functionalization::ViewMeta>& meta);
+  // Get the underlying, actual tensor, that doesn't know anything about
+  // functionalization.
+  const Tensor& value() const {
+    return value_;
+  }
+  // The concept of "level" is only ever important to functorch; it's exposed
+  // here as more of a hook for functorch to use.
+  int64_t level() const {
+    return level_;
+  }
+  void set_level(int64_t level) {
+    level_ = level;
+  }
+  // Returns the has_metadata_mutation_ flag.
+  bool has_metadata_mutation() const {
+    return has_metadata_mutation_;
+  }
+  // The mutation bookkeeping below is delegated to the functional storage
+  // impl rather than kept on the wrapper itself.
+  uint64_t mutation_counter() const {
+    return functional_storage_impl()->mutation_counter();
+  }
+  void mark_mutation() {
+    functional_storage_impl()->mark_mutation();
+  }
+  // Denotes a mutation that's hidden from autograd,
+  // e.g. for the purposes of passing a tensor to a triton kernel
+  void mark_mutation_hidden_from_autograd() {
+    functional_storage_impl()->mark_mutation_hidden_from_autograd();
+  }
+  void mark_mutation_during_no_grad_or_inference_mode() {
+    functional_storage_impl()->mark_mutation_during_no_grad_or_inference_mode();
+  }
+  // Are all the mutations happening to the tensor hidden from autograd
+  bool are_all_mutations_hidden_from_autograd() const {
+    return functional_storage_impl()->are_all_mutations_hidden_from_autograd();
+  }
+  // Did all mutations happen under no_grad or inference_mode
+  // (We also need to ignore mutations fully hidden from autograd here)
+  bool are_all_mutations_under_no_grad_or_inference_mode() const {
+    return functional_storage_impl()
+        ->are_all_mutations_under_no_grad_or_inference_mode();
+  }
+  // Sets is_symbolic_ if `meta` has symbolic inputs; never clears it.
+  void maybe_mark_symbolic(functionalization::ViewMeta* meta) {
+    is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs;
+  }
+  bool is_symbolic() const {
+    return is_symbolic_;
+  }
+  // Retrieves the ViewMeta sequence of this tensor.
+  const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas()
+      const;
+  // Sync's the underlying tensor with its alias, if it's out of date. This
+  // involves two steps: 1) Apply any pending updates/mutations to the alias 2)
+  // Replay the views (if any) to regenerate the current tensor off of the
+  // updated alias.
+  void sync_();
+  // Performs step (2) of the sync. This is its own public API because it's
+  // needed by view_inplace ops like transpose_. See Note [Functionalization
+  // Pass - Inplace View Ops]
+  void regenerate_from_base();
+  // Performs step (1) of the sync. This is its own public API because it's
+  // needed by functorch. functorch wants to make sure that all input tensors to
+  // a functionalized program have been properly synced so it can properly
+  // propagate mutations to inputs. It can't just call sync_(), because the
+  // FunctionalTensorWrapper will look like it has no aliases and sync_ will be
+  // a noop. We use the reference count on storage_ to determine if the wrapper
+  // is aliased, and by the time functorch is ready to propagate updates to
+  // inputs, any intermediate views of the input created by the program will
+  // have been deallocated. This function also returns whether or not the base
+  // actually had any updates to apply.
+  bool apply_updates();
+  // Takes the current state of value_ and snapshots it, sending it as a pending
+  // update to the alias.
+  void commit_update();
+  // When any tensor is mutated, the tensor increments its alias's "generation".
+  // Separately, each tensor maintains its own "generation" counter, which is
+  // used to determine if it's up-to-date with its alias. The act of syncing a
+  // tensor will set a tensor's generation equal to its alias's generation.
+  bool is_up_to_date() const;
+  // Freezes the storage of this tensor, preventing subsequent mutations
+  void freeze_storage() const;
+  // Every FunctionalTensorWrapper contains a vector<ViewMeta> objects
+  // describing the series of view ops that ran to generate the current tensor
+  // from the base tensor. This method is used by inplace-view ops like
+  // transpose_. It appends a ViewMeta to the existing stack, and refreshes the
+  // tensor by replaying the views off of the alias.
+  void mutate_view_meta(
+      const std::shared_ptr<at::functionalization::ViewMeta>& meta);
+  // Custom implementation of self.set_(src)
+  void set__impl(const FunctionalTensorWrapper* other);
+  // Custom implementation of resize_storage_bytes_(self, new_size)
+  void storage_resize_(const c10::SymInt& new_size);
+  // Returns whether the current tensor's data was ever mutated
+  bool has_data_mutation();
+  //
+  // Returns whether the current FunctionalTensorWrapper
+  // experienced a set_() call.
+  bool was_storage_changed() {
+    return was_storage_changed_;
+  }
+  // Records a set_() call: sets the flag and bumps the counter.
+  void mark_storage_changed() {
+    was_storage_changed_ = true;
+    storage_changed_counter_++;
+  }
+  // Number of times mark_storage_changed() has been called.
+  uint64_t storage_changed_counter() {
+    return storage_changed_counter_;
+  }
+  // A FunctionalTensor is considered a base if it's not a view of another
+  // tensor.
+  bool isBaseTensor() const {
+    return view_metas_.empty();
+  }
+  // Forwards to the functional storage impl; the meaning of `before` is
+  // defined there.
+  c10::SymInt get_storage_size(bool before) {
+    return functional_storage_impl()->get_storage_size(before);
+  }
+  // Returns whether the FunctionalTensor experienced an
+  // untyped_storage().resize_() call
+  bool was_inductor_storage_resized() {
+    return functional_storage_impl()->was_inductor_storage_resized();
+  }
+  // NOTE(review): this is named as a counter but returns bool, so any count
+  // coming back from the storage impl would collapse to true/false -- confirm
+  // whether the return type was meant to match storage_changed_counter().
+  bool inductor_storage_resized_counter() {
+    return functional_storage_impl()->inductor_storage_resized_counter();
+  }
+  // The functionalization pass can be used to remove mutations.
+  // It does so by replacing any mutation op with its corresponding
+  // out-of-place op, followed by a call to replace_(). e.g:
+  //
+  // a.add_(1)
+  //
+  // will turn into:
+  //
+  // tmp = a.add(1)
+  // a.replace_(tmp)
+  //
+  // replace_() swaps out the wrapped tensor, value_, with tmp.
+  void replace_(const Tensor& other, bool from_lazy_regenerate = false);
+  bool is_multi_output_view() {
+    return is_multi_output_view_;
+  }
+  // See Note[resize_() in functionalization pass]
+  void maybe_replace_storage(const Tensor& other);
+  // Replaces the storage with a new functional storage,
+  // and clears the view_metas_ stack.
+  // WARNING: Calling this function will sever the aliasing relationship between
+  // the current FunctionalTensorWrapper and any of its outstanding aliases.
+  // Please only call if you know what you're doing.
+  void _unsafe_reset_storage();
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const override;
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const override;
+  ~FunctionalTensorWrapper() override = default;
+  // FunctionalTensorWrapper overrides all custom size/stride function,
+  // so that if the inner tensor has a custom implementation
+  // we make sure to call that implementation.
+  at::IntArrayRef sizes_custom() const override;
+  at::IntArrayRef strides_custom() const override;
+  int64_t dim_custom() const override;
+  int64_t numel_custom() const override;
+  c10::SymBool sym_is_contiguous_custom(
+      at::MemoryFormat memory_format) const override;
+  c10::SymIntArrayRef sym_sizes_custom() const override;
+  c10::SymInt sym_size_custom(int64_t d) const override;
+  c10::SymIntArrayRef sym_strides_custom() const override;
+  c10::SymInt sym_storage_offset_custom() const override;
+  c10::Device device_custom() const override;
+  c10::Layout layout_impl() const override;
+ private:
+  const char* tensorimpl_type_name() const override;
+  void set_constructor_metadata();
+  // Accessor for the storage impl that the mutation-tracking methods above
+  // delegate to.
+  functionalization::FunctionalStorageImpl* functional_storage_impl() const;
+  // This is used to re-implement shallow_copy_and_detach for
+  // FunctionalTensorWrapper. The implementation is identical, but we just need
+  // to return a subclass instead of a plain TensorImpl.
+  // TODO: maybe it's possible to arrange for that to happen automatically
+  // without an override here?
+  template <typename VariableVersion>
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach_core(
+      VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const;
+  void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) override;
+  void copy_tensor_metadata_and_refresh(
+      const FunctionalTensorWrapper* src_impl,
+      FunctionalTensorWrapper* dest_impl,
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const;
+  // Note that value is not taken by reference: internally, the wrapper will
+  // change the value tensor that it points to over time.
+  Tensor value_;
+  int64_t level_{};
+  // These two counters are used for identifying
+  // whether all the mutations on a given tensor are hidden from autograd or
+  // not. If we have an input mutation that is hidden from autograd, then once
+  // we convert the input mutation to a copy_() we know it will be safe to hide
+  // the copy_() from autograd as well.
+  // NOTE(review): no such pair of counters is declared here any more; the
+  // mutation accessors above forward to the functional storage impl, so the
+  // paragraph above is presumably stale -- confirm and relocate or remove.
+  bool has_metadata_mutation_ = false;
+  bool is_multi_output_view_ = false;
+  // Did the tensor experience a set_() call.
+  bool was_storage_changed_ = false;
+  uint64_t storage_changed_counter_ = 0;
+  // Did the tensor experience any view operation with symbolic int.
+  bool is_symbolic_ = false;
+  // This wrapper's own generation counter (see is_up_to_date()).
+  size_t generation_ = 0;
+  // View ops that produced this tensor from its base; empty for a base
+  // tensor (see isBaseTensor()).
+  std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_;
+ protected:
+  static void copy_tensor_metadata(
+      const FunctionalTensorWrapper* src_impl,
+      FunctionalTensorWrapper* dest_impl,
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change);
+};
+// Utility functions for the functionalization pass.
+namespace functionalization {
+namespace impl {
+// Returns the TensorImpl behind `tensor` viewed as a FunctionalTensorWrapper.
+// The conversion is a plain static_cast with no runtime type check, so the
+// caller is responsible for only passing tensors whose impl really is a
+// FunctionalTensorWrapper (presumably established beforehand with
+// isFunctionalTensor() -- confirm against callers).
+inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
+    const Tensor& tensor) {
+  auto functional_impl =
+      static_cast<FunctionalTensorWrapper*>(tensor.unsafeGetTensorImpl());
+  // The only validation performed here is this null check.
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(functional_impl != nullptr);
+  return functional_impl;
+}
+// Predicates on tensors. isFunctionalTensor is overloaded for a single
+// tensor, an optional tensor, a c10::List of optional tensors and an
+// ITensorListRef. How the list overloads combine per-element results is
+// decided by the implementation and is not visible from this header.
+TORCH_API bool isBaseTensor(const at::Tensor& tensor);
+TORCH_API bool isFunctionalTensor(const at::Tensor& tensor);
+TORCH_API bool isFunctionalTensor(const std::optional<Tensor>& t);
+TORCH_API bool isFunctionalTensor(
+    const c10::List<std::optional<Tensor>>& t_list);
+TORCH_API bool isFunctionalTensor(ITensorListRef list);
+// to_functional_tensor overloads. The Tensor, optional and c10::List
+// overloads return the same kind of value they accept; the ITensorListRef
+// overload returns a std::vector<Tensor>.
+TORCH_API Tensor to_functional_tensor(const Tensor& tensor);
+TORCH_API std::optional<Tensor> to_functional_tensor(
+    const std::optional<Tensor>& tensor);
+TORCH_API c10::List<std::optional<Tensor>> to_functional_tensor(
+    const c10::List<std::optional<Tensor>>& t_list);
+TORCH_API std::vector<Tensor> to_functional_tensor(ITensorListRef t_list);
+TORCH_API void freeze_functional_tensor(const Tensor& tensor);
+// from_functional_tensor overloads, mirroring to_functional_tensor above.
+// Only the single-tensor and optional overloads expose `assert_functional`
+// (default: true); the two list overloads take no such flag.
+TORCH_API Tensor
+from_functional_tensor(const Tensor& tensor, bool assert_functional = true);
+TORCH_API std::optional<Tensor> from_functional_tensor(
+    const std::optional<Tensor>& t,
+    bool assert_functional = true);
+TORCH_API c10::List<std::optional<Tensor>> from_functional_tensor(
+    const c10::List<std::optional<Tensor>>& t_list);
+TORCH_API std::vector<Tensor> from_functional_tensor(ITensorListRef t_list);
+// sync is overloaded for a single tensor, an optional tensor, a c10::List of
+// optional tensors and an ITensorListRef. All overloads return void.
+TORCH_API void sync(const at::Tensor& t);
+TORCH_API void sync(const std::optional<Tensor>& t);
+TORCH_API void sync(const c10::List<std::optional<Tensor>>& t_list);
+TORCH_API void sync(ITensorListRef t_list);
+// replace_ and commit_update each come in a single-tensor form and an
+// ITensorListRef form. The list form of replace_ takes two lists
+// (`functional_tensor` and `other`).
+TORCH_API void replace_(const Tensor& functional_tensor, const Tensor& other);
+TORCH_API void replace_(
+    const ITensorListRef functional_tensor,
+    ITensorListRef other);
+TORCH_API void commit_update(const Tensor& functional_tensor);
+TORCH_API void commit_update(ITensorListRef functional_tensor);
+TORCH_API void unsafe_reset_storage(const Tensor& functional_tensor);
+// Marker and queries about mutations recorded for a functional tensor.
+// NOTE(review): the exact bookkeeping lives in the implementation file; only
+// the signatures are visible here.
+TORCH_API void mark_mutation_hidden_from_autograd(
+    const Tensor& functional_tensor);
+TORCH_API bool are_all_mutations_hidden_from_autograd(
+    const Tensor& functional_tensor);
+TORCH_API bool are_all_mutations_under_no_grad_or_inference_mode(
+    const Tensor& functional_tensor);
+// These two functions (propagate_xla_data and propagate_xla_data_direct,
+// each overloaded for a single tensor and for a tensor list) are
+// XLA-specific logic and are no-ops for the normal functionalization flow.
+TORCH_API void propagate_xla_data(
+    const Tensor& functional_tensor,
+    const Tensor& other);
+TORCH_API void propagate_xla_data(
+    const ITensorListRef functional_tensor,
+    ITensorListRef other);
+TORCH_API void propagate_xla_data_direct(
+    const Tensor& tensor,
+    const Tensor& other);
+TORCH_API void propagate_xla_data_direct(
+    const ITensorListRef tensor,
+    ITensorListRef other);
+// NOTE(review): except for apply_view_meta_sequence, the declarations below
+// are not marked TORCH_API, unlike most other functions in this namespace.
+// Presumably they are only meant to be called from within the library that
+// defines them -- confirm before using them from another shared object.
+//
+// create_functional_tensor_with_view_meta has a single-tensor form, which
+// takes an `out_idx` (default 0), and an ITensorListRef form, which returns
+// a std::vector<Tensor> and takes no `out_idx`.
+Tensor create_functional_tensor_with_view_meta(
+    const Tensor& view_to_wrap,
+    const Tensor& base,
+    const std::shared_ptr<functionalization::ViewMeta>& meta,
+    int64_t out_idx = 0);
+std::vector<Tensor> create_functional_tensor_with_view_meta(
+    ITensorListRef view_to_wrap,
+    const Tensor& base,
+    const std::shared_ptr<functionalization::ViewMeta>& meta);
+void mutate_view_meta(
+    const Tensor& self,
+    const std::shared_ptr<functionalization::ViewMeta>& meta);
+// Takes a base tensor plus a vector of shared ViewMeta pointers and returns
+// a Tensor.
+TORCH_API Tensor apply_view_meta_sequence(
+    const Tensor& base,
+    const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);
+// set_sizes_strides_offset has a single-tensor form and a
+// std::vector<Tensor> form; each takes an output and a matching `meta_out`.
+void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
+void set_sizes_strides_offset(
+    const std::vector<Tensor>& outs,
+    const std::vector<Tensor>& meta_outs);
+//  ~~~~~ TLS used in functionalization ~~~~~
+// Getter / setter pair for the boolean "reapply views" flag. Both are used
+// by FunctionalizationReapplyViewsGuard below to save and restore the flag.
+TORCH_API bool getFunctionalizationReapplyViewsTLS();
+TORCH_API void setFunctionalizationReapplyViewsTLS(bool reapply_views);
+// Scoped guard for the functionalization "reapply views" flag.
+//
+// The constructor records the value currently returned by
+// getFunctionalizationReapplyViewsTLS() and then installs `reapply_views`
+// through setFunctionalizationReapplyViewsTLS(); the destructor writes the
+// recorded value back. The guard can be neither copied nor moved, so each
+// instance restores the flag exactly once.
+class TORCH_API FunctionalizationReapplyViewsGuard {
+ public:
+  // reapply_views: value the flag holds for the lifetime of this guard.
+  // Kept non-explicit so existing callers stay source-compatible.
+  FunctionalizationReapplyViewsGuard(bool reapply_views)
+      : prev_(getFunctionalizationReapplyViewsTLS()) {
+    setFunctionalizationReapplyViewsTLS(reapply_views);
+  }
+  ~FunctionalizationReapplyViewsGuard() {
+    setFunctionalizationReapplyViewsTLS(prev_);
+  }
+  // Copy and move operations are deleted. The assignment operators use the
+  // conventional reference return type (they were previously declared as
+  // returning the guard by value).
+  FunctionalizationReapplyViewsGuard(
+      const FunctionalizationReapplyViewsGuard&) = delete;
+  FunctionalizationReapplyViewsGuard& operator=(
+      const FunctionalizationReapplyViewsGuard&) = delete;
+  FunctionalizationReapplyViewsGuard(FunctionalizationReapplyViewsGuard&&) =
+      delete;
+  FunctionalizationReapplyViewsGuard& operator=(
+      FunctionalizationReapplyViewsGuard&&) = delete;
+ private:
+  // Flag value observed at construction; restored by the destructor.
+  bool prev_;
+};
+} // namespace impl
+// Helper function to call an out-of-place composite aten kernel that may use
+// mutations / views internally, and functionalize them.
+//
+// op:    handle of the operator to run.
+// stack: pointer to the torch::jit::Stack passed alongside `op`. This
+//        function is wrapped as a boxed kernel by _functionalize_aten_op
+//        below, so the stack presumably carries the boxed arguments and
+//        receives the results -- confirm in the implementation.
+TORCH_API void functionalize_op_helper(
+    const c10::OperatorHandle& op,
+    torch::jit::Stack* stack);
+// Primary template: empty. Only the partial specialization below, which
+// matches a function type `ReturnType(ParameterTypes...)`, provides `call`.
+template <class Op, bool symint, class ReturnType, class... ParameterTypes>
+struct _functionalize_aten_op final {};
+// Specialization for function-type schemas.
+//
+// `call` looks up the operator identified by Op::name / Op::overload_name in
+// the dispatcher, types the handle with the (optionally SymInt-preserving)
+// signature, and invokes functionalize_op_helper on it through
+// BoxedKernelWrapper, forwarding `args` and returning the wrapper's result.
+template <class Op, bool symint, class ReturnType, class... ParameterTypes>
+struct _functionalize_aten_op<Op, symint, ReturnType(ParameterTypes...)> final {
+  static ReturnType call(
+      typename c10::maybe_keep_symint<symint, ParameterTypes>::type... args) {
+    // Signature used to type the operator handle; each parameter type is
+    // passed through c10::maybe_keep_symint<symint, ...>.
+    using FuncType = ReturnType(
+        typename c10::maybe_keep_symint<symint, ParameterTypes>::type...);
+    // Named casts rather than C-style casts: the conversion is identical
+    // whenever it is a plain (array-to-)pointer conversion, but a
+    // static_cast refuses to compile instead of silently reinterpreting or
+    // dropping constness if Op::name ever changes type.
+    auto op = c10::Dispatcher::singleton()
+                  .findSchemaOrThrow(
+                      static_cast<const char*>(Op::name),
+                      static_cast<const char*>(Op::overload_name))
+                  .typed<FuncType>();
+    return c10::impl::BoxedKernelWrapper<FuncType>::call(
+        c10::BoxedKernel::makeFromFunction<functionalize_op_helper>(),
+        op,
+        // BoxedKernelWrapper knows to ignore this keyset argument,
+        // because functionalize_op_helper doesn't take in a DispatchKeySet
+        c10::DispatchKeySet(),
+        args...);
+  }
+};
+// Convenience aliases over _functionalize_aten_op that take the signature
+// from `Op::schema`. functionalize_aten_op instantiates it with
+// symint = false.
+template <class Op>
+using functionalize_aten_op =
+    _functionalize_aten_op<Op, false, typename Op::schema>;
+// Same as functionalize_aten_op, but instantiated with symint = true.
+template <class Op>
+using functionalize_aten_op_symint =
+    _functionalize_aten_op<Op, true, typename Op::schema>;
+} // namespace functionalization
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Functions.h ADDED Viewed

	@@ -0,0 +1,1476 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from Functions.h
+#ifdef TORCH_ASSERT_NO_OPERATORS
+#error This change adds a dependency on native_functions.yaml,            \
+  meaning the file will need to be re-compiled every time an operator     \
+  is changed or added. Consider if your change would be better placed in  \
+  another file, or if a more specific header might achieve the same goal. \
+  See NOTE: [Tensor vs. TensorBase]
+#endif
+#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on all pytorch operators, meaning the     \
+  file will need to be re-compiled every time an operator is changed or added. \
+  Consider including a specific operator from <ATen/ops/{my_operator}.h> and   \
+  see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
+#endif
+// NOTE: [TORCH_ASSERT_ONLY_METHOD_OPERATORS]
+//
+// In ATen, certain generated headers files include the definitions of
+// every single operator in PyTorch. Unfortunately this means every
+// time an operator signature is updated or changed in
+// native_functions.yaml, you (and every other PyTorch developer) need
+// to recompile every source file that includes any of these headers.
+//
+// To break up these header dependencies and improve incremental
+// build times for all PyTorch developers, these headers are split
+// into per-operator headers in the `ATen/ops` folder. This limits
+// incremental builds to only changes to methods of `Tensor`, or files
+// that use the specific operator being changed. With `at::sum` as an
+// example, you should include
+//
+//   <ATen/ops/sum.h>               // instead of ATen/Functions.h
+//   <ATen/ops/sum_native.h>        // instead of ATen/NativeFunctions.h
+//   <ATen/ops/sum_ops.h>           // instead of ATen/Operators.h
+//   <ATen/ops/sum_cpu_dispatch.h>  // instead of ATen/CPUFunctions.h
+//
+// However, even if you're careful to use this in your own code,
+// `Functions.h` might be included indirectly through another header
+// without you realising. To avoid this, you can add
+//
+//   #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+//
+// to the top of your source file. This way any time the non-specific
+// headers are included, the compiler will error out.
+//
+// Also, be aware that `ops` are not available in all build
+// configurations (namely fb-internal) so you must guard these
+// includes with `#ifdef AT_PER_OPERATOR_HEADERS`. e.g.
+//
+//   #ifndef AT_PER_OPERATOR_HEADERS
+//   #include <ATen/Functions.h>
+//   #else
+//   #include <ATen/ops/sum.h>
+//   #endif
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <c10/core/SymInt.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/util/OptionalArrayRef.h>
+#include <ATen/ops/from_blob.h>
+#include <ATen/ops/tensor.h>
+#include <ATen/ops/_adaptive_avg_pool2d.h>
+#include <ATen/ops/_adaptive_avg_pool2d_backward.h>
+#include <ATen/ops/_adaptive_avg_pool3d.h>
+#include <ATen/ops/_adaptive_avg_pool3d_backward.h>
+#include <ATen/ops/_add_batch_dim.h>
+#include <ATen/ops/_add_relu.h>
+#include <ATen/ops/_addmm_activation.h>
+#include <ATen/ops/_aminmax.h>
+#include <ATen/ops/_amp_foreach_non_finite_check_and_unscale.h>
+#include <ATen/ops/_amp_update_scale.h>
+#include <ATen/ops/_assert_async.h>
+#include <ATen/ops/_assert_scalar.h>
+#include <ATen/ops/_assert_tensor_metadata.h>
+#include <ATen/ops/_autocast_to_full_precision.h>
+#include <ATen/ops/_autocast_to_reduced_precision.h>
+#include <ATen/ops/_backward.h>
+#include <ATen/ops/_batch_norm_impl_index.h>
+#include <ATen/ops/_batch_norm_impl_index_backward.h>
+#include <ATen/ops/_batch_norm_no_update.h>
+#include <ATen/ops/_batch_norm_with_update.h>
+#include <ATen/ops/_cast_Byte.h>
+#include <ATen/ops/_cast_Char.h>
+#include <ATen/ops/_cast_Double.h>
+#include <ATen/ops/_cast_Float.h>
+#include <ATen/ops/_cast_Half.h>
+#include <ATen/ops/_cast_Int.h>
+#include <ATen/ops/_cast_Long.h>
+#include <ATen/ops/_cast_Short.h>
+#include <ATen/ops/_cdist_backward.h>
+#include <ATen/ops/_cdist_forward.h>
+#include <ATen/ops/_cholesky_solve_helper.h>
+#include <ATen/ops/_choose_qparams_per_tensor.h>
+#include <ATen/ops/_chunk_cat.h>
+#include <ATen/ops/_coalesce.h>
+#include <ATen/ops/_coalesced.h>
+#include <ATen/ops/_compute_linear_combination.h>
+#include <ATen/ops/_conj.h>
+#include <ATen/ops/_conj_copy.h>
+#include <ATen/ops/_conj_physical.h>
+#include <ATen/ops/_conv_depthwise2d.h>
+#include <ATen/ops/_convert_indices_from_coo_to_csr.h>
+#include <ATen/ops/_convert_indices_from_csr_to_coo.h>
+#include <ATen/ops/_convert_weight_to_int4pack.h>
+#include <ATen/ops/_convert_weight_to_int4pack_for_cpu.h>
+#include <ATen/ops/_convolution.h>
+#include <ATen/ops/_convolution_double_backward.h>
+#include <ATen/ops/_convolution_mode.h>
+#include <ATen/ops/_copy_from.h>
+#include <ATen/ops/_copy_from_and_resize.h>
+#include <ATen/ops/_cslt_compress.h>
+#include <ATen/ops/_cslt_sparse_mm.h>
+#include <ATen/ops/_cslt_sparse_mm_search.h>
+#include <ATen/ops/_ctc_loss.h>
+#include <ATen/ops/_ctc_loss_backward.h>
+#include <ATen/ops/_cudnn_attention_backward.h>
+#include <ATen/ops/_cudnn_attention_forward.h>
+#include <ATen/ops/_cudnn_ctc_loss.h>
+#include <ATen/ops/_cudnn_init_dropout_state.h>
+#include <ATen/ops/_cudnn_rnn.h>
+#include <ATen/ops/_cudnn_rnn_backward.h>
+#include <ATen/ops/_cudnn_rnn_flatten_weight.h>
+#include <ATen/ops/_cufft_clear_plan_cache.h>
+#include <ATen/ops/_cufft_get_plan_cache_max_size.h>
+#include <ATen/ops/_cufft_get_plan_cache_size.h>
+#include <ATen/ops/_cufft_set_plan_cache_max_size.h>
+#include <ATen/ops/_cummax_helper.h>
+#include <ATen/ops/_cummin_helper.h>
+#include <ATen/ops/_debug_has_internal_overlap.h>
+#include <ATen/ops/_dimI.h>
+#include <ATen/ops/_dimV.h>
+#include <ATen/ops/_dim_arange.h>
+#include <ATen/ops/_dirichlet_grad.h>
+#include <ATen/ops/_dyn_quant_matmul_4bit.h>
+#include <ATen/ops/_dyn_quant_pack_4bit_weight.h>
+#include <ATen/ops/_efficient_attention_backward.h>
+#include <ATen/ops/_efficient_attention_forward.h>
+#include <ATen/ops/_efficientzerotensor.h>
+#include <ATen/ops/_embedding_bag.h>
+#include <ATen/ops/_embedding_bag_backward.h>
+#include <ATen/ops/_embedding_bag_dense_backward.h>
+#include <ATen/ops/_embedding_bag_forward_only.h>
+#include <ATen/ops/_embedding_bag_per_sample_weights_backward.h>
+#include <ATen/ops/_embedding_bag_sparse_backward.h>
+#include <ATen/ops/_empty_affine_quantized.h>
+#include <ATen/ops/_empty_per_channel_affine_quantized.h>
+#include <ATen/ops/_euclidean_dist.h>
+#include <ATen/ops/_fake_quantize_learnable_per_channel_affine.h>
+#include <ATen/ops/_fake_quantize_learnable_per_channel_affine_backward.h>
+#include <ATen/ops/_fake_quantize_learnable_per_tensor_affine.h>
+#include <ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward.h>
+#include <ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.h>
+#include <ATen/ops/_fft_c2c.h>
+#include <ATen/ops/_fft_c2r.h>
+#include <ATen/ops/_fft_r2c.h>
+#include <ATen/ops/_fill_mem_eff_dropout_mask.h>
+#include <ATen/ops/_flash_attention_backward.h>
+#include <ATen/ops/_flash_attention_forward.h>
+#include <ATen/ops/_foobar.h>
+#include <ATen/ops/_foreach_abs.h>
+#include <ATen/ops/_foreach_acos.h>
+#include <ATen/ops/_foreach_add.h>
+#include <ATen/ops/_foreach_addcdiv.h>
+#include <ATen/ops/_foreach_addcmul.h>
+#include <ATen/ops/_foreach_asin.h>
+#include <ATen/ops/_foreach_atan.h>
+#include <ATen/ops/_foreach_ceil.h>
+#include <ATen/ops/_foreach_clamp_max.h>
+#include <ATen/ops/_foreach_clamp_min.h>
+#include <ATen/ops/_foreach_copy.h>
+#include <ATen/ops/_foreach_cos.h>
+#include <ATen/ops/_foreach_cosh.h>
+#include <ATen/ops/_foreach_div.h>
+#include <ATen/ops/_foreach_erf.h>
+#include <ATen/ops/_foreach_erfc.h>
+#include <ATen/ops/_foreach_exp.h>
+#include <ATen/ops/_foreach_expm1.h>
+#include <ATen/ops/_foreach_floor.h>
+#include <ATen/ops/_foreach_frac.h>
+#include <ATen/ops/_foreach_lerp.h>
+#include <ATen/ops/_foreach_lgamma.h>
+#include <ATen/ops/_foreach_log.h>
+#include <ATen/ops/_foreach_log10.h>
+#include <ATen/ops/_foreach_log1p.h>
+#include <ATen/ops/_foreach_log2.h>
+#include <ATen/ops/_foreach_max.h>
+#include <ATen/ops/_foreach_maximum.h>
+#include <ATen/ops/_foreach_minimum.h>
+#include <ATen/ops/_foreach_mul.h>
+#include <ATen/ops/_foreach_neg.h>
+#include <ATen/ops/_foreach_norm.h>
+#include <ATen/ops/_foreach_pow.h>
+#include <ATen/ops/_foreach_reciprocal.h>
+#include <ATen/ops/_foreach_round.h>
+#include <ATen/ops/_foreach_rsqrt.h>
+#include <ATen/ops/_foreach_sigmoid.h>
+#include <ATen/ops/_foreach_sign.h>
+#include <ATen/ops/_foreach_sin.h>
+#include <ATen/ops/_foreach_sinh.h>
+#include <ATen/ops/_foreach_sqrt.h>
+#include <ATen/ops/_foreach_sub.h>
+#include <ATen/ops/_foreach_tan.h>
+#include <ATen/ops/_foreach_tanh.h>
+#include <ATen/ops/_foreach_trunc.h>
+#include <ATen/ops/_foreach_zero.h>
+#include <ATen/ops/_functional_assert_async.h>
+#include <ATen/ops/_functional_assert_scalar.h>
+#include <ATen/ops/_functional_sym_constrain_range.h>
+#include <ATen/ops/_functional_sym_constrain_range_for_size.h>
+#include <ATen/ops/_fused_adagrad.h>
+#include <ATen/ops/_fused_adam.h>
+#include <ATen/ops/_fused_adamw.h>
+#include <ATen/ops/_fused_dropout.h>
+#include <ATen/ops/_fused_moving_avg_obs_fq_helper.h>
+#include <ATen/ops/_fused_rms_norm.h>
+#include <ATen/ops/_fused_rms_norm_backward.h>
+#include <ATen/ops/_fused_sdp_choice.h>
+#include <ATen/ops/_fused_sgd.h>
+#include <ATen/ops/_fw_primal.h>
+#include <ATen/ops/_fw_primal_copy.h>
+#include <ATen/ops/_gather_sparse_backward.h>
+#include <ATen/ops/_grid_sampler_2d_cpu_fallback.h>
+#include <ATen/ops/_grid_sampler_2d_cpu_fallback_backward.h>
+#include <ATen/ops/_grouped_mm.h>
+#include <ATen/ops/_has_compatible_shallow_copy_type.h>
+#include <ATen/ops/_has_same_storage_numel.h>
+#include <ATen/ops/_histogramdd_bin_edges.h>
+#include <ATen/ops/_histogramdd_from_bin_cts.h>
+#include <ATen/ops/_histogramdd_from_bin_tensors.h>
+#include <ATen/ops/_index_put_impl.h>
+#include <ATen/ops/_indices.h>
+#include <ATen/ops/_indices_copy.h>
+#include <ATen/ops/_int_mm.h>
+#include <ATen/ops/_is_all_true.h>
+#include <ATen/ops/_is_any_true.h>
+#include <ATen/ops/_is_zerotensor.h>
+#include <ATen/ops/_jagged_to_padded_dense_forward.h>
+#include <ATen/ops/_lazy_clone.h>
+#include <ATen/ops/_linalg_check_errors.h>
+#include <ATen/ops/_linalg_det.h>
+#include <ATen/ops/_linalg_eigh.h>
+#include <ATen/ops/_linalg_eigvals.h>
+#include <ATen/ops/_linalg_slogdet.h>
+#include <ATen/ops/_linalg_solve_ex.h>
+#include <ATen/ops/_linalg_svd.h>
+#include <ATen/ops/_local_scalar_dense.h>
+#include <ATen/ops/_log_softmax.h>
+#include <ATen/ops/_log_softmax_backward_data.h>
+#include <ATen/ops/_logcumsumexp.h>
+#include <ATen/ops/_lstm_mps.h>
+#include <ATen/ops/_lu_with_info.h>
+#include <ATen/ops/_make_dep_token.h>
+#include <ATen/ops/_make_dual.h>
+#include <ATen/ops/_make_dual_copy.h>
+#include <ATen/ops/_make_per_channel_quantized_tensor.h>
+#include <ATen/ops/_make_per_tensor_quantized_tensor.h>
+#include <ATen/ops/_masked_scale.h>
+#include <ATen/ops/_masked_softmax.h>
+#include <ATen/ops/_masked_softmax_backward.h>
+#include <ATen/ops/_mixed_dtypes_linear.h>
+#include <ATen/ops/_mkldnn_reshape.h>
+#include <ATen/ops/_mkldnn_transpose.h>
+#include <ATen/ops/_mps_convolution.h>
+#include <ATen/ops/_mps_convolution_transpose.h>
+#include <ATen/ops/_native_batch_norm_legit.h>
+#include <ATen/ops/_native_batch_norm_legit_no_training.h>
+#include <ATen/ops/_native_multi_head_attention.h>
+#include <ATen/ops/_neg_view.h>
+#include <ATen/ops/_neg_view_copy.h>
+#include <ATen/ops/_nested_compute_contiguous_strides_offsets.h>
+#include <ATen/ops/_nested_from_padded.h>
+#include <ATen/ops/_nested_from_padded_and_nested_example.h>
+#include <ATen/ops/_nested_from_padded_tensor.h>
+#include <ATen/ops/_nested_get_jagged_dummy.h>
+#include <ATen/ops/_nested_get_lengths.h>
+#include <ATen/ops/_nested_get_max_seqlen.h>
+#include <ATen/ops/_nested_get_min_seqlen.h>
+#include <ATen/ops/_nested_get_offsets.h>
+#include <ATen/ops/_nested_get_ragged_idx.h>
+#include <ATen/ops/_nested_get_values.h>
+#include <ATen/ops/_nested_get_values_copy.h>
+#include <ATen/ops/_nested_select_backward.h>
+#include <ATen/ops/_nested_sum_backward.h>
+#include <ATen/ops/_nested_tensor_from_mask.h>
+#include <ATen/ops/_nested_tensor_from_mask_left_aligned.h>
+#include <ATen/ops/_nested_tensor_from_tensor_list.h>
+#include <ATen/ops/_nested_tensor_size.h>
+#include <ATen/ops/_nested_tensor_softmax_with_shape.h>
+#include <ATen/ops/_nested_tensor_storage_offsets.h>
+#include <ATen/ops/_nested_tensor_strides.h>
+#include <ATen/ops/_nested_view_from_buffer.h>
+#include <ATen/ops/_nested_view_from_buffer_copy.h>
+#include <ATen/ops/_nested_view_from_jagged.h>
+#include <ATen/ops/_nested_view_from_jagged_copy.h>
+#include <ATen/ops/_new_zeros_with_same_feature_meta.h>
+#include <ATen/ops/_nnpack_available.h>
+#include <ATen/ops/_nnpack_spatial_convolution.h>
+#include <ATen/ops/_nnz.h>
+#include <ATen/ops/_pack_padded_sequence.h>
+#include <ATen/ops/_pack_padded_sequence_backward.h>
+#include <ATen/ops/_pad_circular.h>
+#include <ATen/ops/_pad_enum.h>
+#include <ATen/ops/_pad_packed_sequence.h>
+#include <ATen/ops/_padded_dense_to_jagged_forward.h>
+#include <ATen/ops/_pdist_backward.h>
+#include <ATen/ops/_pdist_forward.h>
+#include <ATen/ops/_pin_memory.h>
+#include <ATen/ops/_prelu_kernel.h>
+#include <ATen/ops/_prelu_kernel_backward.h>
+#include <ATen/ops/_print.h>
+#include <ATen/ops/_propagate_xla_data.h>
+#include <ATen/ops/_remove_batch_dim.h>
+#include <ATen/ops/_reshape_alias.h>
+#include <ATen/ops/_reshape_alias_copy.h>
+#include <ATen/ops/_reshape_copy.h>
+#include <ATen/ops/_reshape_from_tensor.h>
+#include <ATen/ops/_resize_output.h>
+#include <ATen/ops/_rowwise_prune.h>
+#include <ATen/ops/_safe_softmax.h>
+#include <ATen/ops/_sample_dirichlet.h>
+#include <ATen/ops/_saturate_weight_to_fp16.h>
+#include <ATen/ops/_scaled_dot_product_attention_math.h>
+#include <ATen/ops/_scaled_dot_product_attention_math_for_mps.h>
+#include <ATen/ops/_scaled_dot_product_cudnn_attention.h>
+#include <ATen/ops/_scaled_dot_product_cudnn_attention_backward.h>
+#include <ATen/ops/_scaled_dot_product_efficient_attention.h>
+#include <ATen/ops/_scaled_dot_product_efficient_attention_backward.h>
+#include <ATen/ops/_scaled_dot_product_flash_attention.h>
+#include <ATen/ops/_scaled_dot_product_flash_attention_backward.h>
+#include <ATen/ops/_scaled_dot_product_flash_attention_for_cpu.h>
+#include <ATen/ops/_scaled_dot_product_flash_attention_for_cpu_backward.h>
+#include <ATen/ops/_scaled_dot_product_fused_attention_overrideable.h>
+#include <ATen/ops/_scaled_dot_product_fused_attention_overrideable_backward.h>
+#include <ATen/ops/_scaled_grouped_mm.h>
+#include <ATen/ops/_scaled_grouped_mm_v2.h>
+#include <ATen/ops/_scaled_mm.h>
+#include <ATen/ops/_scaled_mm_v2.h>
+#include <ATen/ops/_segment_reduce_backward.h>
+#include <ATen/ops/_shape_as_tensor.h>
+#include <ATen/ops/_slow_conv2d_backward.h>
+#include <ATen/ops/_slow_conv2d_forward.h>
+#include <ATen/ops/_sobol_engine_draw.h>
+#include <ATen/ops/_sobol_engine_ff.h>
+#include <ATen/ops/_sobol_engine_initialize_state.h>
+#include <ATen/ops/_sobol_engine_scramble.h>
+#include <ATen/ops/_softmax.h>
+#include <ATen/ops/_softmax_backward_data.h>
+#include <ATen/ops/_sparse_addmm.h>
+#include <ATen/ops/_sparse_broadcast_to.h>
+#include <ATen/ops/_sparse_broadcast_to_copy.h>
+#include <ATen/ops/_sparse_bsc_tensor_unsafe.h>
+#include <ATen/ops/_sparse_bsr_tensor_unsafe.h>
+#include <ATen/ops/_sparse_compressed_tensor_unsafe.h>
+#include <ATen/ops/_sparse_compressed_tensor_with_dims.h>
+#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
+#include <ATen/ops/_sparse_coo_tensor_with_dims.h>
+#include <ATen/ops/_sparse_coo_tensor_with_dims_and_tensors.h>
+#include <ATen/ops/_sparse_csc_tensor_unsafe.h>
+#include <ATen/ops/_sparse_csr_prod.h>
+#include <ATen/ops/_sparse_csr_sum.h>
+#include <ATen/ops/_sparse_csr_tensor_unsafe.h>
+#include <ATen/ops/_sparse_log_softmax.h>
+#include <ATen/ops/_sparse_log_softmax_backward_data.h>
+#include <ATen/ops/_sparse_mask_projection.h>
+#include <ATen/ops/_sparse_mm.h>
+#include <ATen/ops/_sparse_mm_reduce_impl.h>
+#include <ATen/ops/_sparse_mm_reduce_impl_backward.h>
+#include <ATen/ops/_sparse_semi_structured_addmm.h>
+#include <ATen/ops/_sparse_semi_structured_apply.h>
+#include <ATen/ops/_sparse_semi_structured_apply_dense.h>
+#include <ATen/ops/_sparse_semi_structured_linear.h>
+#include <ATen/ops/_sparse_semi_structured_mm.h>
+#include <ATen/ops/_sparse_semi_structured_tile.h>
+#include <ATen/ops/_sparse_softmax.h>
+#include <ATen/ops/_sparse_softmax_backward_data.h>
+#include <ATen/ops/_sparse_sparse_matmul.h>
+#include <ATen/ops/_sparse_sum.h>
+#include <ATen/ops/_sparse_sum_backward.h>
+#include <ATen/ops/_spdiags.h>
+#include <ATen/ops/_spsolve.h>
+#include <ATen/ops/_stack.h>
+#include <ATen/ops/_standard_gamma.h>
+#include <ATen/ops/_standard_gamma_grad.h>
+#include <ATen/ops/_test_ambiguous_defaults.h>
+#include <ATen/ops/_test_autograd_multiple_dispatch.h>
+#include <ATen/ops/_test_autograd_multiple_dispatch_view.h>
+#include <ATen/ops/_test_autograd_multiple_dispatch_view_copy.h>
+#include <ATen/ops/_test_check_tensor.h>
+#include <ATen/ops/_test_functorch_fallback.h>
+#include <ATen/ops/_test_optional_filled_intlist.h>
+#include <ATen/ops/_test_optional_floatlist.h>
+#include <ATen/ops/_test_optional_intlist.h>
+#include <ATen/ops/_test_parallel_materialize.h>
+#include <ATen/ops/_test_serialization_subcmul.h>
+#include <ATen/ops/_test_string_default.h>
+#include <ATen/ops/_test_warn_in_autograd.h>
+#include <ATen/ops/_thnn_differentiable_gru_cell_backward.h>
+#include <ATen/ops/_thnn_differentiable_lstm_cell_backward.h>
+#include <ATen/ops/_thnn_fused_gru_cell.h>
+#include <ATen/ops/_thnn_fused_gru_cell_backward.h>
+#include <ATen/ops/_thnn_fused_lstm_cell.h>
+#include <ATen/ops/_thnn_fused_lstm_cell_backward.h>
+#include <ATen/ops/_thnn_fused_lstm_cell_backward_impl.h>
+#include <ATen/ops/_to_copy.h>
+#include <ATen/ops/_to_cpu.h>
+#include <ATen/ops/_to_dense.h>
+#include <ATen/ops/_to_sparse.h>
+#include <ATen/ops/_to_sparse_bsc.h>
+#include <ATen/ops/_to_sparse_bsr.h>
+#include <ATen/ops/_to_sparse_csc.h>
+#include <ATen/ops/_to_sparse_csr.h>
+#include <ATen/ops/_to_sparse_semi_structured.h>
+#include <ATen/ops/_transform_bias_rescale_qkv.h>
+#include <ATen/ops/_transformer_encoder_layer_fwd.h>
+#include <ATen/ops/_trilinear.h>
+#include <ATen/ops/_triton_multi_head_attention.h>
+#include <ATen/ops/_triton_scaled_dot_attention.h>
+#include <ATen/ops/_unique.h>
+#include <ATen/ops/_unique2.h>
+#include <ATen/ops/_unpack_dual.h>
+#include <ATen/ops/_unsafe_index.h>
+#include <ATen/ops/_unsafe_index_put.h>
+#include <ATen/ops/_unsafe_masked_index.h>
+#include <ATen/ops/_unsafe_masked_index_put_accumulate.h>
+#include <ATen/ops/_unsafe_view.h>
+#include <ATen/ops/_upsample_bicubic2d_aa.h>
+#include <ATen/ops/_upsample_bicubic2d_aa_backward.h>
+#include <ATen/ops/_upsample_bilinear2d_aa.h>
+#include <ATen/ops/_upsample_bilinear2d_aa_backward.h>
+#include <ATen/ops/_upsample_nearest_exact1d.h>
+#include <ATen/ops/_upsample_nearest_exact1d_backward.h>
+#include <ATen/ops/_upsample_nearest_exact2d.h>
+#include <ATen/ops/_upsample_nearest_exact2d_backward.h>
+#include <ATen/ops/_upsample_nearest_exact3d.h>
+#include <ATen/ops/_upsample_nearest_exact3d_backward.h>
+#include <ATen/ops/_use_cudnn_ctc_loss.h>
+#include <ATen/ops/_use_cudnn_rnn_flatten_weight.h>
+#include <ATen/ops/_validate_compressed_sparse_indices.h>
+#include <ATen/ops/_validate_sparse_bsc_tensor_args.h>
+#include <ATen/ops/_validate_sparse_bsr_tensor_args.h>
+#include <ATen/ops/_validate_sparse_compressed_tensor_args.h>
+#include <ATen/ops/_validate_sparse_coo_tensor_args.h>
+#include <ATen/ops/_validate_sparse_csc_tensor_args.h>
+#include <ATen/ops/_validate_sparse_csr_tensor_args.h>
+#include <ATen/ops/_values.h>
+#include <ATen/ops/_values_copy.h>
+#include <ATen/ops/_version.h>
+#include <ATen/ops/_weight_int4pack_mm.h>
+#include <ATen/ops/_weight_int4pack_mm_for_cpu.h>
+#include <ATen/ops/_weight_int4pack_mm_with_scales_and_zeros.h>
+#include <ATen/ops/_weight_int8pack_mm.h>
+#include <ATen/ops/_weight_norm.h>
+#include <ATen/ops/_weight_norm_differentiable_backward.h>
+#include <ATen/ops/_weight_norm_interface.h>
+#include <ATen/ops/_weight_norm_interface_backward.h>
+#include <ATen/ops/_wrapped_linear_prepack.h>
+#include <ATen/ops/_wrapped_quantized_linear_prepacked.h>
+#include <ATen/ops/abs.h>
+#include <ATen/ops/absolute.h>
+#include <ATen/ops/acos.h>
+#include <ATen/ops/acosh.h>
+#include <ATen/ops/adaptive_avg_pool1d.h>
+#include <ATen/ops/adaptive_avg_pool2d.h>
+#include <ATen/ops/adaptive_avg_pool3d.h>
+#include <ATen/ops/adaptive_avg_pool3d_backward.h>
+#include <ATen/ops/adaptive_max_pool1d.h>
+#include <ATen/ops/adaptive_max_pool2d.h>
+#include <ATen/ops/adaptive_max_pool2d_backward.h>
+#include <ATen/ops/adaptive_max_pool3d.h>
+#include <ATen/ops/adaptive_max_pool3d_backward.h>
+#include <ATen/ops/add.h>
+#include <ATen/ops/addbmm.h>
+#include <ATen/ops/addcdiv.h>
+#include <ATen/ops/addcmul.h>
+#include <ATen/ops/addmm.h>
+#include <ATen/ops/addmv.h>
+#include <ATen/ops/addr.h>
+#include <ATen/ops/adjoint.h>
+#include <ATen/ops/affine_grid_generator.h>
+#include <ATen/ops/affine_grid_generator_backward.h>
+#include <ATen/ops/alias.h>
+#include <ATen/ops/alias_copy.h>
+#include <ATen/ops/align_as.h>
+#include <ATen/ops/align_tensors.h>
+#include <ATen/ops/align_to.h>
+#include <ATen/ops/all.h>
+#include <ATen/ops/allclose.h>
+#include <ATen/ops/alpha_dropout.h>
+#include <ATen/ops/amax.h>
+#include <ATen/ops/amin.h>
+#include <ATen/ops/aminmax.h>
+#include <ATen/ops/and.h>
+#include <ATen/ops/angle.h>
+#include <ATen/ops/any.h>
+#include <ATen/ops/arange.h>
+#include <ATen/ops/arccos.h>
+#include <ATen/ops/arccosh.h>
+#include <ATen/ops/arcsin.h>
+#include <ATen/ops/arcsinh.h>
+#include <ATen/ops/arctan.h>
+#include <ATen/ops/arctan2.h>
+#include <ATen/ops/arctanh.h>
+#include <ATen/ops/argmax.h>
+#include <ATen/ops/argmin.h>
+#include <ATen/ops/argsort.h>
+#include <ATen/ops/argwhere.h>
+#include <ATen/ops/as_strided.h>
+#include <ATen/ops/as_strided_copy.h>
+#include <ATen/ops/as_strided_scatter.h>
+#include <ATen/ops/asin.h>
+#include <ATen/ops/asinh.h>
+#include <ATen/ops/atan.h>
+#include <ATen/ops/atan2.h>
+#include <ATen/ops/atanh.h>
+#include <ATen/ops/atleast_1d.h>
+#include <ATen/ops/atleast_2d.h>
+#include <ATen/ops/atleast_3d.h>
+#include <ATen/ops/avg_pool1d.h>
+#include <ATen/ops/avg_pool2d.h>
+#include <ATen/ops/avg_pool2d_backward.h>
+#include <ATen/ops/avg_pool3d.h>
+#include <ATen/ops/avg_pool3d_backward.h>
+#include <ATen/ops/baddbmm.h>
+#include <ATen/ops/bartlett_window.h>
+#include <ATen/ops/batch_norm.h>
+#include <ATen/ops/batch_norm_backward.h>
+#include <ATen/ops/batch_norm_backward_elemt.h>
+#include <ATen/ops/batch_norm_backward_reduce.h>
+#include <ATen/ops/batch_norm_elemt.h>
+#include <ATen/ops/batch_norm_gather_stats.h>
+#include <ATen/ops/batch_norm_gather_stats_with_counts.h>
+#include <ATen/ops/batch_norm_stats.h>
+#include <ATen/ops/batch_norm_update_stats.h>
+#include <ATen/ops/bernoulli.h>
+#include <ATen/ops/bilinear.h>
+#include <ATen/ops/binary_cross_entropy.h>
+#include <ATen/ops/binary_cross_entropy_backward.h>
+#include <ATen/ops/binary_cross_entropy_with_logits.h>
+#include <ATen/ops/bincount.h>
+#include <ATen/ops/binomial.h>
+#include <ATen/ops/bitwise_and.h>
+#include <ATen/ops/bitwise_left_shift.h>
+#include <ATen/ops/bitwise_not.h>
+#include <ATen/ops/bitwise_or.h>
+#include <ATen/ops/bitwise_right_shift.h>
+#include <ATen/ops/bitwise_xor.h>
+#include <ATen/ops/blackman_window.h>
+#include <ATen/ops/block_diag.h>
+#include <ATen/ops/bmm.h>
+#include <ATen/ops/broadcast_tensors.h>
+#include <ATen/ops/broadcast_to.h>
+#include <ATen/ops/bucketize.h>
+#include <ATen/ops/can_cast.h>
+#include <ATen/ops/cartesian_prod.h>
+#include <ATen/ops/cat.h>
+#include <ATen/ops/cauchy.h>
+#include <ATen/ops/ccol_indices.h>
+#include <ATen/ops/ccol_indices_copy.h>
+#include <ATen/ops/cdist.h>
+#include <ATen/ops/ceil.h>
+#include <ATen/ops/celu.h>
+#include <ATen/ops/chain_matmul.h>
+#include <ATen/ops/chalf.h>
+#include <ATen/ops/channel_shuffle.h>
+#include <ATen/ops/cholesky.h>
+#include <ATen/ops/cholesky_inverse.h>
+#include <ATen/ops/cholesky_solve.h>
+#include <ATen/ops/choose_qparams_optimized.h>
+#include <ATen/ops/chunk.h>
+#include <ATen/ops/clamp.h>
+#include <ATen/ops/clamp_max.h>
+#include <ATen/ops/clamp_min.h>
+#include <ATen/ops/clip.h>
+#include <ATen/ops/clone.h>
+#include <ATen/ops/coalesce.h>
+#include <ATen/ops/col2im.h>
+#include <ATen/ops/col_indices.h>
+#include <ATen/ops/col_indices_copy.h>
+#include <ATen/ops/column_stack.h>
+#include <ATen/ops/combinations.h>
+#include <ATen/ops/complex.h>
+#include <ATen/ops/concat.h>
+#include <ATen/ops/concatenate.h>
+#include <ATen/ops/conj.h>
+#include <ATen/ops/conj_physical.h>
+#include <ATen/ops/constant_pad_nd.h>
+#include <ATen/ops/contiguous.h>
+#include <ATen/ops/conv1d.h>
+#include <ATen/ops/conv2d.h>
+#include <ATen/ops/conv3d.h>
+#include <ATen/ops/conv_depthwise3d.h>
+#include <ATen/ops/conv_tbc.h>
+#include <ATen/ops/conv_tbc_backward.h>
+#include <ATen/ops/conv_transpose1d.h>
+#include <ATen/ops/conv_transpose2d.h>
+#include <ATen/ops/conv_transpose3d.h>
+#include <ATen/ops/convolution.h>
+#include <ATen/ops/convolution_backward.h>
+#include <ATen/ops/convolution_backward_overrideable.h>
+#include <ATen/ops/convolution_overrideable.h>
+#include <ATen/ops/copy.h>
+#include <ATen/ops/copy_sparse_to_sparse.h>
+#include <ATen/ops/copysign.h>
+#include <ATen/ops/corrcoef.h>
+#include <ATen/ops/cos.h>
+#include <ATen/ops/cosh.h>
+#include <ATen/ops/cosine_embedding_loss.h>
+#include <ATen/ops/cosine_similarity.h>
+#include <ATen/ops/count_nonzero.h>
+#include <ATen/ops/cov.h>
+#include <ATen/ops/cross.h>
+#include <ATen/ops/cross_entropy_loss.h>
+#include <ATen/ops/crow_indices.h>
+#include <ATen/ops/crow_indices_copy.h>
+#include <ATen/ops/ctc_loss.h>
+#include <ATen/ops/cudnn_affine_grid_generator.h>
+#include <ATen/ops/cudnn_affine_grid_generator_backward.h>
+#include <ATen/ops/cudnn_batch_norm.h>
+#include <ATen/ops/cudnn_batch_norm_backward.h>
+#include <ATen/ops/cudnn_convolution.h>
+#include <ATen/ops/cudnn_convolution_add_relu.h>
+#include <ATen/ops/cudnn_convolution_relu.h>
+#include <ATen/ops/cudnn_convolution_transpose.h>
+#include <ATen/ops/cudnn_grid_sampler.h>
+#include <ATen/ops/cudnn_grid_sampler_backward.h>
+#include <ATen/ops/cudnn_is_acceptable.h>
+#include <ATen/ops/cummax.h>
+#include <ATen/ops/cummaxmin_backward.h>
+#include <ATen/ops/cummin.h>
+#include <ATen/ops/cumprod.h>
+#include <ATen/ops/cumprod_backward.h>
+#include <ATen/ops/cumsum.h>
+#include <ATen/ops/cumulative_trapezoid.h>
+#include <ATen/ops/data.h>
+#include <ATen/ops/deg2rad.h>
+#include <ATen/ops/dense_dim.h>
+#include <ATen/ops/dequantize.h>
+#include <ATen/ops/det.h>
+#include <ATen/ops/detach.h>
+#include <ATen/ops/detach_copy.h>
+#include <ATen/ops/diag.h>
+#include <ATen/ops/diag_embed.h>
+#include <ATen/ops/diagflat.h>
+#include <ATen/ops/diagonal.h>
+#include <ATen/ops/diagonal_backward.h>
+#include <ATen/ops/diagonal_copy.h>
+#include <ATen/ops/diagonal_scatter.h>
+#include <ATen/ops/diff.h>
+#include <ATen/ops/digamma.h>
+#include <ATen/ops/dist.h>
+#include <ATen/ops/div.h>
+#include <ATen/ops/divide.h>
+#include <ATen/ops/dot.h>
+#include <ATen/ops/dropout.h>
+#include <ATen/ops/dsplit.h>
+#include <ATen/ops/dstack.h>
+#include <ATen/ops/einsum.h>
+#include <ATen/ops/elu.h>
+#include <ATen/ops/elu_backward.h>
+#include <ATen/ops/embedding.h>
+#include <ATen/ops/embedding_backward.h>
+#include <ATen/ops/embedding_bag.h>
+#include <ATen/ops/embedding_dense_backward.h>
+#include <ATen/ops/embedding_renorm.h>
+#include <ATen/ops/embedding_sparse_backward.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/empty_permuted.h>
+#include <ATen/ops/empty_quantized.h>
+#include <ATen/ops/empty_strided.h>
+#include <ATen/ops/eq.h>
+#include <ATen/ops/equal.h>
+#include <ATen/ops/erf.h>
+#include <ATen/ops/erfc.h>
+#include <ATen/ops/erfinv.h>
+#include <ATen/ops/exp.h>
+#include <ATen/ops/exp2.h>
+#include <ATen/ops/expand.h>
+#include <ATen/ops/expand_as.h>
+#include <ATen/ops/expand_copy.h>
+#include <ATen/ops/expm1.h>
+#include <ATen/ops/exponential.h>
+#include <ATen/ops/eye.h>
+#include <ATen/ops/fake_quantize_per_channel_affine.h>
+#include <ATen/ops/fake_quantize_per_channel_affine_cachemask.h>
+#include <ATen/ops/fake_quantize_per_channel_affine_cachemask_backward.h>
+#include <ATen/ops/fake_quantize_per_tensor_affine.h>
+#include <ATen/ops/fake_quantize_per_tensor_affine_cachemask.h>
+#include <ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward.h>
+#include <ATen/ops/fbgemm_linear_fp16_weight.h>
+#include <ATen/ops/fbgemm_linear_fp16_weight_fp32_activation.h>
+#include <ATen/ops/fbgemm_linear_int8_weight.h>
+#include <ATen/ops/fbgemm_linear_int8_weight_fp32_activation.h>
+#include <ATen/ops/fbgemm_linear_quantize_weight.h>
+#include <ATen/ops/fbgemm_pack_gemm_matrix_fp16.h>
+#include <ATen/ops/fbgemm_pack_quantized_matrix.h>
+#include <ATen/ops/feature_alpha_dropout.h>
+#include <ATen/ops/feature_dropout.h>
+#include <ATen/ops/fft_fft.h>
+#include <ATen/ops/fft_fft2.h>
+#include <ATen/ops/fft_fftfreq.h>
+#include <ATen/ops/fft_fftn.h>
+#include <ATen/ops/fft_fftshift.h>
+#include <ATen/ops/fft_hfft.h>
+#include <ATen/ops/fft_hfft2.h>
+#include <ATen/ops/fft_hfftn.h>
+#include <ATen/ops/fft_ifft.h>
+#include <ATen/ops/fft_ifft2.h>
+#include <ATen/ops/fft_ifftn.h>
+#include <ATen/ops/fft_ifftshift.h>
+#include <ATen/ops/fft_ihfft.h>
+#include <ATen/ops/fft_ihfft2.h>
+#include <ATen/ops/fft_ihfftn.h>
+#include <ATen/ops/fft_irfft.h>
+#include <ATen/ops/fft_irfft2.h>
+#include <ATen/ops/fft_irfftn.h>
+#include <ATen/ops/fft_rfft.h>
+#include <ATen/ops/fft_rfft2.h>
+#include <ATen/ops/fft_rfftfreq.h>
+#include <ATen/ops/fft_rfftn.h>
+#include <ATen/ops/fill.h>
+#include <ATen/ops/fill_diagonal.h>
+#include <ATen/ops/fix.h>
+#include <ATen/ops/flatten.h>
+#include <ATen/ops/flatten_dense_tensors.h>
+#include <ATen/ops/flip.h>
+#include <ATen/ops/fliplr.h>
+#include <ATen/ops/flipud.h>
+#include <ATen/ops/float_power.h>
+#include <ATen/ops/floor.h>
+#include <ATen/ops/floor_divide.h>
+#include <ATen/ops/fmax.h>
+#include <ATen/ops/fmin.h>
+#include <ATen/ops/fmod.h>
+#include <ATen/ops/frac.h>
+#include <ATen/ops/fractional_max_pool2d.h>
+#include <ATen/ops/fractional_max_pool2d_backward.h>
+#include <ATen/ops/fractional_max_pool3d.h>
+#include <ATen/ops/fractional_max_pool3d_backward.h>
+#include <ATen/ops/frexp.h>
+#include <ATen/ops/frobenius_norm.h>
+#include <ATen/ops/from_file.h>
+#include <ATen/ops/full.h>
+#include <ATen/ops/full_like.h>
+#include <ATen/ops/fused_moving_avg_obs_fake_quant.h>
+#include <ATen/ops/gather.h>
+#include <ATen/ops/gather_backward.h>
+#include <ATen/ops/gcd.h>
+#include <ATen/ops/ge.h>
+#include <ATen/ops/gelu.h>
+#include <ATen/ops/gelu_backward.h>
+#include <ATen/ops/geometric.h>
+#include <ATen/ops/geqrf.h>
+#include <ATen/ops/ger.h>
+#include <ATen/ops/glu.h>
+#include <ATen/ops/glu_backward.h>
+#include <ATen/ops/glu_backward_jvp.h>
+#include <ATen/ops/glu_jvp.h>
+#include <ATen/ops/gradient.h>
+#include <ATen/ops/greater.h>
+#include <ATen/ops/greater_equal.h>
+#include <ATen/ops/grid_sampler.h>
+#include <ATen/ops/grid_sampler_2d.h>
+#include <ATen/ops/grid_sampler_2d_backward.h>
+#include <ATen/ops/grid_sampler_3d.h>
+#include <ATen/ops/grid_sampler_3d_backward.h>
+#include <ATen/ops/group_norm.h>
+#include <ATen/ops/gru.h>
+#include <ATen/ops/gru_cell.h>
+#include <ATen/ops/gt.h>
+#include <ATen/ops/hamming_window.h>
+#include <ATen/ops/hann_window.h>
+#include <ATen/ops/hardshrink.h>
+#include <ATen/ops/hardshrink_backward.h>
+#include <ATen/ops/hardsigmoid.h>
+#include <ATen/ops/hardsigmoid_backward.h>
+#include <ATen/ops/hardswish.h>
+#include <ATen/ops/hardswish_backward.h>
+#include <ATen/ops/hardtanh.h>
+#include <ATen/ops/hardtanh_backward.h>
+#include <ATen/ops/hash_tensor.h>
+#include <ATen/ops/heaviside.h>
+#include <ATen/ops/hinge_embedding_loss.h>
+#include <ATen/ops/histc.h>
+#include <ATen/ops/histogram.h>
+#include <ATen/ops/histogramdd.h>
+#include <ATen/ops/hsplit.h>
+#include <ATen/ops/hspmm.h>
+#include <ATen/ops/hstack.h>
+#include <ATen/ops/huber_loss.h>
+#include <ATen/ops/huber_loss_backward.h>
+#include <ATen/ops/hypot.h>
+#include <ATen/ops/i0.h>
+#include <ATen/ops/igamma.h>
+#include <ATen/ops/igammac.h>
+#include <ATen/ops/im2col.h>
+#include <ATen/ops/imag.h>
+#include <ATen/ops/index.h>
+#include <ATen/ops/index_add.h>
+#include <ATen/ops/index_copy.h>
+#include <ATen/ops/index_fill.h>
+#include <ATen/ops/index_put.h>
+#include <ATen/ops/index_reduce.h>
+#include <ATen/ops/index_select.h>
+#include <ATen/ops/index_select_backward.h>
+#include <ATen/ops/indices.h>
+#include <ATen/ops/indices_copy.h>
+#include <ATen/ops/infinitely_differentiable_gelu_backward.h>
+#include <ATen/ops/inner.h>
+#include <ATen/ops/instance_norm.h>
+#include <ATen/ops/int_repr.h>
+#include <ATen/ops/inverse.h>
+#include <ATen/ops/is_coalesced.h>
+#include <ATen/ops/is_complex.h>
+#include <ATen/ops/is_conj.h>
+#include <ATen/ops/is_distributed.h>
+#include <ATen/ops/is_floating_point.h>
+#include <ATen/ops/is_inference.h>
+#include <ATen/ops/is_leaf.h>
+#include <ATen/ops/is_neg.h>
+#include <ATen/ops/is_nonzero.h>
+#include <ATen/ops/is_pinned.h>
+#include <ATen/ops/is_same_size.h>
+#include <ATen/ops/is_set_to.h>
+#include <ATen/ops/is_signed.h>
+#include <ATen/ops/is_vulkan_available.h>
+#include <ATen/ops/isclose.h>
+#include <ATen/ops/isfinite.h>
+#include <ATen/ops/isin.h>
+#include <ATen/ops/isinf.h>
+#include <ATen/ops/isnan.h>
+#include <ATen/ops/isneginf.h>
+#include <ATen/ops/isposinf.h>
+#include <ATen/ops/isreal.h>
+#include <ATen/ops/istft.h>
+#include <ATen/ops/item.h>
+#include <ATen/ops/kaiser_window.h>
+#include <ATen/ops/kl_div.h>
+#include <ATen/ops/kron.h>
+#include <ATen/ops/kthvalue.h>
+#include <ATen/ops/l1_loss.h>
+#include <ATen/ops/layer_norm.h>
+#include <ATen/ops/lcm.h>
+#include <ATen/ops/ldexp.h>
+#include <ATen/ops/le.h>
+#include <ATen/ops/leaky_relu.h>
+#include <ATen/ops/leaky_relu_backward.h>
+#include <ATen/ops/lerp.h>
+#include <ATen/ops/less.h>
+#include <ATen/ops/less_equal.h>
+#include <ATen/ops/lgamma.h>
+#include <ATen/ops/lift.h>
+#include <ATen/ops/lift_fresh.h>
+#include <ATen/ops/lift_fresh_copy.h>
+#include <ATen/ops/linalg_cholesky.h>
+#include <ATen/ops/linalg_cholesky_ex.h>
+#include <ATen/ops/linalg_cond.h>
+#include <ATen/ops/linalg_cross.h>
+#include <ATen/ops/linalg_det.h>
+#include <ATen/ops/linalg_diagonal.h>
+#include <ATen/ops/linalg_eig.h>
+#include <ATen/ops/linalg_eigh.h>
+#include <ATen/ops/linalg_eigvals.h>
+#include <ATen/ops/linalg_eigvalsh.h>
+#include <ATen/ops/linalg_householder_product.h>
+#include <ATen/ops/linalg_inv.h>
+#include <ATen/ops/linalg_inv_ex.h>
+#include <ATen/ops/linalg_ldl_factor.h>
+#include <ATen/ops/linalg_ldl_factor_ex.h>
+#include <ATen/ops/linalg_ldl_solve.h>
+#include <ATen/ops/linalg_lstsq.h>
+#include <ATen/ops/linalg_lu.h>
+#include <ATen/ops/linalg_lu_factor.h>
+#include <ATen/ops/linalg_lu_factor_ex.h>
+#include <ATen/ops/linalg_lu_solve.h>
+#include <ATen/ops/linalg_matmul.h>
+#include <ATen/ops/linalg_matrix_exp.h>
+#include <ATen/ops/linalg_matrix_norm.h>
+#include <ATen/ops/linalg_matrix_power.h>
+#include <ATen/ops/linalg_matrix_rank.h>
+#include <ATen/ops/linalg_multi_dot.h>
+#include <ATen/ops/linalg_norm.h>
+#include <ATen/ops/linalg_pinv.h>
+#include <ATen/ops/linalg_qr.h>
+#include <ATen/ops/linalg_slogdet.h>
+#include <ATen/ops/linalg_solve.h>
+#include <ATen/ops/linalg_solve_ex.h>
+#include <ATen/ops/linalg_solve_triangular.h>
+#include <ATen/ops/linalg_svd.h>
+#include <ATen/ops/linalg_svdvals.h>
+#include <ATen/ops/linalg_tensorinv.h>
+#include <ATen/ops/linalg_tensorsolve.h>
+#include <ATen/ops/linalg_vander.h>
+#include <ATen/ops/linalg_vecdot.h>
+#include <ATen/ops/linalg_vector_norm.h>
+#include <ATen/ops/linear.h>
+#include <ATen/ops/linear_backward.h>
+#include <ATen/ops/linspace.h>
+#include <ATen/ops/log.h>
+#include <ATen/ops/log10.h>
+#include <ATen/ops/log1p.h>
+#include <ATen/ops/log2.h>
+#include <ATen/ops/log_normal.h>
+#include <ATen/ops/log_sigmoid.h>
+#include <ATen/ops/log_sigmoid_backward.h>
+#include <ATen/ops/log_sigmoid_forward.h>
+#include <ATen/ops/log_softmax.h>
+#include <ATen/ops/logaddexp.h>
+#include <ATen/ops/logaddexp2.h>
+#include <ATen/ops/logcumsumexp.h>
+#include <ATen/ops/logdet.h>
+#include <ATen/ops/logical_and.h>
+#include <ATen/ops/logical_not.h>
+#include <ATen/ops/logical_or.h>
+#include <ATen/ops/logical_xor.h>
+#include <ATen/ops/logit.h>
+#include <ATen/ops/logit_backward.h>
+#include <ATen/ops/logspace.h>
+#include <ATen/ops/logsumexp.h>
+#include <ATen/ops/lshift.h>
+#include <ATen/ops/lstm.h>
+#include <ATen/ops/lstm_cell.h>
+#include <ATen/ops/lstm_mps_backward.h>
+#include <ATen/ops/lt.h>
+#include <ATen/ops/lu_solve.h>
+#include <ATen/ops/lu_unpack.h>
+#include <ATen/ops/mH.h>
+#include <ATen/ops/mT.h>
+#include <ATen/ops/margin_ranking_loss.h>
+#include <ATen/ops/masked_fill.h>
+#include <ATen/ops/masked_scatter.h>
+#include <ATen/ops/masked_scatter_backward.h>
+#include <ATen/ops/masked_select.h>
+#include <ATen/ops/masked_select_backward.h>
+#include <ATen/ops/matmul.h>
+#include <ATen/ops/matmul_backward.h>
+#include <ATen/ops/matrix_H.h>
+#include <ATen/ops/matrix_exp.h>
+#include <ATen/ops/matrix_exp_backward.h>
+#include <ATen/ops/matrix_power.h>
+#include <ATen/ops/max.h>
+#include <ATen/ops/max_pool1d.h>
+#include <ATen/ops/max_pool1d_with_indices.h>
+#include <ATen/ops/max_pool2d.h>
+#include <ATen/ops/max_pool2d_backward.h>
+#include <ATen/ops/max_pool2d_with_indices.h>
+#include <ATen/ops/max_pool2d_with_indices_backward.h>
+#include <ATen/ops/max_pool3d.h>
+#include <ATen/ops/max_pool3d_with_indices.h>
+#include <ATen/ops/max_pool3d_with_indices_backward.h>
+#include <ATen/ops/max_unpool2d.h>
+#include <ATen/ops/max_unpool3d.h>
+#include <ATen/ops/maximum.h>
+#include <ATen/ops/mean.h>
+#include <ATen/ops/median.h>
+#include <ATen/ops/meshgrid.h>
+#include <ATen/ops/min.h>
+#include <ATen/ops/minimum.h>
+#include <ATen/ops/miopen_batch_norm.h>
+#include <ATen/ops/miopen_batch_norm_backward.h>
+#include <ATen/ops/miopen_convolution.h>
+#include <ATen/ops/miopen_convolution_add_relu.h>
+#include <ATen/ops/miopen_convolution_relu.h>
+#include <ATen/ops/miopen_convolution_transpose.h>
+#include <ATen/ops/miopen_depthwise_convolution.h>
+#include <ATen/ops/miopen_rnn.h>
+#include <ATen/ops/miopen_rnn_backward.h>
+#include <ATen/ops/mish.h>
+#include <ATen/ops/mish_backward.h>
+#include <ATen/ops/mkldnn_adaptive_avg_pool2d.h>
+#include <ATen/ops/mkldnn_adaptive_avg_pool2d_backward.h>
+#include <ATen/ops/mkldnn_convolution.h>
+#include <ATen/ops/mkldnn_linear.h>
+#include <ATen/ops/mkldnn_linear_backward.h>
+#include <ATen/ops/mkldnn_linear_backward_input.h>
+#include <ATen/ops/mkldnn_linear_backward_weights.h>
+#include <ATen/ops/mkldnn_max_pool2d.h>
+#include <ATen/ops/mkldnn_max_pool2d_backward.h>
+#include <ATen/ops/mkldnn_max_pool3d.h>
+#include <ATen/ops/mkldnn_max_pool3d_backward.h>
+#include <ATen/ops/mkldnn_reorder_conv2d_weight.h>
+#include <ATen/ops/mkldnn_reorder_conv3d_weight.h>
+#include <ATen/ops/mkldnn_rnn_layer.h>
+#include <ATen/ops/mkldnn_rnn_layer_backward.h>
+#include <ATen/ops/mm.h>
+#include <ATen/ops/mode.h>
+#include <ATen/ops/moveaxis.h>
+#include <ATen/ops/movedim.h>
+#include <ATen/ops/mps_convolution_backward.h>
+#include <ATen/ops/mps_convolution_transpose_backward.h>
+#include <ATen/ops/mse_loss.h>
+#include <ATen/ops/mse_loss_backward.h>
+#include <ATen/ops/msort.h>
+#include <ATen/ops/mul.h>
+#include <ATen/ops/multi_margin_loss.h>
+#include <ATen/ops/multi_margin_loss_backward.h>
+#include <ATen/ops/multilabel_margin_loss.h>
+#include <ATen/ops/multilabel_margin_loss_backward.h>
+#include <ATen/ops/multilabel_margin_loss_forward.h>
+#include <ATen/ops/multinomial.h>
+#include <ATen/ops/multiply.h>
+#include <ATen/ops/mv.h>
+#include <ATen/ops/mvlgamma.h>
+#include <ATen/ops/nan_to_num.h>
+#include <ATen/ops/nanmean.h>
+#include <ATen/ops/nanmedian.h>
+#include <ATen/ops/nanquantile.h>
+#include <ATen/ops/nansum.h>
+#include <ATen/ops/narrow.h>
+#include <ATen/ops/narrow_copy.h>
+#include <ATen/ops/native_batch_norm.h>
+#include <ATen/ops/native_batch_norm_backward.h>
+#include <ATen/ops/native_channel_shuffle.h>
+#include <ATen/ops/native_dropout.h>
+#include <ATen/ops/native_dropout_backward.h>
+#include <ATen/ops/native_group_norm.h>
+#include <ATen/ops/native_group_norm_backward.h>
+#include <ATen/ops/native_layer_norm.h>
+#include <ATen/ops/native_layer_norm_backward.h>
+#include <ATen/ops/native_norm.h>
+#include <ATen/ops/ne.h>
+#include <ATen/ops/neg.h>
+#include <ATen/ops/negative.h>
+#include <ATen/ops/nested_to_padded_tensor.h>
+#include <ATen/ops/new_empty.h>
+#include <ATen/ops/new_empty_strided.h>
+#include <ATen/ops/new_full.h>
+#include <ATen/ops/new_ones.h>
+#include <ATen/ops/new_zeros.h>
+#include <ATen/ops/nextafter.h>
+#include <ATen/ops/nll_loss.h>
+#include <ATen/ops/nll_loss2d.h>
+#include <ATen/ops/nll_loss2d_backward.h>
+#include <ATen/ops/nll_loss2d_forward.h>
+#include <ATen/ops/nll_loss_backward.h>
+#include <ATen/ops/nll_loss_forward.h>
+#include <ATen/ops/nll_loss_nd.h>
+#include <ATen/ops/nonzero.h>
+#include <ATen/ops/nonzero_numpy.h>
+#include <ATen/ops/nonzero_static.h>
+#include <ATen/ops/norm.h>
+#include <ATen/ops/norm_except_dim.h>
+#include <ATen/ops/normal.h>
+#include <ATen/ops/not_equal.h>
+#include <ATen/ops/nuclear_norm.h>
+#include <ATen/ops/numpy_T.h>
+#include <ATen/ops/one_hot.h>
+#include <ATen/ops/ones.h>
+#include <ATen/ops/ones_like.h>
+#include <ATen/ops/or.h>
+#include <ATen/ops/orgqr.h>
+#include <ATen/ops/ormqr.h>
+#include <ATen/ops/outer.h>
+#include <ATen/ops/output_nr.h>
+#include <ATen/ops/pad.h>
+#include <ATen/ops/pad_sequence.h>
+#include <ATen/ops/pairwise_distance.h>
+#include <ATen/ops/pdist.h>
+#include <ATen/ops/permute.h>
+#include <ATen/ops/permute_copy.h>
+#include <ATen/ops/pin_memory.h>
+#include <ATen/ops/pinverse.h>
+#include <ATen/ops/pixel_shuffle.h>
+#include <ATen/ops/pixel_unshuffle.h>
+#include <ATen/ops/poisson.h>
+#include <ATen/ops/poisson_nll_loss.h>
+#include <ATen/ops/polar.h>
+#include <ATen/ops/polygamma.h>
+#include <ATen/ops/positive.h>
+#include <ATen/ops/pow.h>
+#include <ATen/ops/prelu.h>
+#include <ATen/ops/prod.h>
+#include <ATen/ops/promote_types.h>
+#include <ATen/ops/put.h>
+#include <ATen/ops/q_per_channel_axis.h>
+#include <ATen/ops/q_per_channel_scales.h>
+#include <ATen/ops/q_per_channel_zero_points.h>
+#include <ATen/ops/q_scale.h>
+#include <ATen/ops/q_zero_point.h>
+#include <ATen/ops/qr.h>
+#include <ATen/ops/qscheme.h>
+#include <ATen/ops/quantile.h>
+#include <ATen/ops/quantize_per_channel.h>
+#include <ATen/ops/quantize_per_tensor.h>
+#include <ATen/ops/quantize_per_tensor_dynamic.h>
+#include <ATen/ops/quantized_batch_norm.h>
+#include <ATen/ops/quantized_gru_cell.h>
+#include <ATen/ops/quantized_lstm_cell.h>
+#include <ATen/ops/quantized_max_pool1d.h>
+#include <ATen/ops/quantized_max_pool2d.h>
+#include <ATen/ops/quantized_max_pool3d.h>
+#include <ATen/ops/quantized_rnn_relu_cell.h>
+#include <ATen/ops/quantized_rnn_tanh_cell.h>
+#include <ATen/ops/rad2deg.h>
+#include <ATen/ops/rand.h>
+#include <ATen/ops/rand_like.h>
+#include <ATen/ops/randint.h>
+#include <ATen/ops/randint_like.h>
+#include <ATen/ops/randn.h>
+#include <ATen/ops/randn_like.h>
+#include <ATen/ops/random.h>
+#include <ATen/ops/randperm.h>
+#include <ATen/ops/range.h>
+#include <ATen/ops/ravel.h>
+#include <ATen/ops/real.h>
+#include <ATen/ops/reciprocal.h>
+#include <ATen/ops/record_stream.h>
+#include <ATen/ops/refine_names.h>
+#include <ATen/ops/reflection_pad1d.h>
+#include <ATen/ops/reflection_pad1d_backward.h>
+#include <ATen/ops/reflection_pad2d.h>
+#include <ATen/ops/reflection_pad2d_backward.h>
+#include <ATen/ops/reflection_pad3d.h>
+#include <ATen/ops/reflection_pad3d_backward.h>
+#include <ATen/ops/relu.h>
+#include <ATen/ops/relu6.h>
+#include <ATen/ops/remainder.h>
+#include <ATen/ops/rename.h>
+#include <ATen/ops/renorm.h>
+#include <ATen/ops/repeat.h>
+#include <ATen/ops/repeat_interleave.h>
+#include <ATen/ops/replication_pad1d.h>
+#include <ATen/ops/replication_pad1d_backward.h>
+#include <ATen/ops/replication_pad2d.h>
+#include <ATen/ops/replication_pad2d_backward.h>
+#include <ATen/ops/replication_pad3d.h>
+#include <ATen/ops/replication_pad3d_backward.h>
+#include <ATen/ops/requires_grad.h>
+#include <ATen/ops/reshape.h>
+#include <ATen/ops/reshape_as.h>
+#include <ATen/ops/resize.h>
+#include <ATen/ops/resize_as.h>
+#include <ATen/ops/resize_as_sparse.h>
+#include <ATen/ops/resolve_conj.h>
+#include <ATen/ops/resolve_neg.h>
+#include <ATen/ops/result_type.h>
+#include <ATen/ops/retain_grad.h>
+#include <ATen/ops/retains_grad.h>
+#include <ATen/ops/rms_norm.h>
+#include <ATen/ops/rnn_relu.h>
+#include <ATen/ops/rnn_relu_cell.h>
+#include <ATen/ops/rnn_tanh.h>
+#include <ATen/ops/rnn_tanh_cell.h>
+#include <ATen/ops/roll.h>
+#include <ATen/ops/rot90.h>
+#include <ATen/ops/round.h>
+#include <ATen/ops/row_indices.h>
+#include <ATen/ops/row_indices_copy.h>
+#include <ATen/ops/row_stack.h>
+#include <ATen/ops/rrelu.h>
+#include <ATen/ops/rrelu_with_noise.h>
+#include <ATen/ops/rrelu_with_noise_backward.h>
+#include <ATen/ops/rshift.h>
+#include <ATen/ops/rsqrt.h>
+#include <ATen/ops/rsub.h>
+#include <ATen/ops/scalar_tensor.h>
+#include <ATen/ops/scaled_dot_product_attention.h>
+#include <ATen/ops/scatter.h>
+#include <ATen/ops/scatter_add.h>
+#include <ATen/ops/scatter_reduce.h>
+#include <ATen/ops/searchsorted.h>
+#include <ATen/ops/segment_reduce.h>
+#include <ATen/ops/select.h>
+#include <ATen/ops/select_backward.h>
+#include <ATen/ops/select_copy.h>
+#include <ATen/ops/select_scatter.h>
+#include <ATen/ops/selu.h>
+#include <ATen/ops/set.h>
+#include <ATen/ops/set_data.h>
+#include <ATen/ops/sgn.h>
+#include <ATen/ops/sigmoid.h>
+#include <ATen/ops/sigmoid_backward.h>
+#include <ATen/ops/sign.h>
+#include <ATen/ops/signbit.h>
+#include <ATen/ops/silu.h>
+#include <ATen/ops/silu_backward.h>
+#include <ATen/ops/sin.h>
+#include <ATen/ops/sinc.h>
+#include <ATen/ops/sinh.h>
+#include <ATen/ops/size.h>
+#include <ATen/ops/slice.h>
+#include <ATen/ops/slice_backward.h>
+#include <ATen/ops/slice_copy.h>
+#include <ATen/ops/slice_inverse.h>
+#include <ATen/ops/slice_scatter.h>
+#include <ATen/ops/slogdet.h>
+#include <ATen/ops/slow_conv3d.h>
+#include <ATen/ops/slow_conv3d_forward.h>
+#include <ATen/ops/slow_conv_dilated2d.h>
+#include <ATen/ops/slow_conv_dilated3d.h>
+#include <ATen/ops/slow_conv_transpose2d.h>
+#include <ATen/ops/slow_conv_transpose3d.h>
+#include <ATen/ops/smm.h>
+#include <ATen/ops/smooth_l1_loss.h>
+#include <ATen/ops/smooth_l1_loss_backward.h>
+#include <ATen/ops/soft_margin_loss.h>
+#include <ATen/ops/soft_margin_loss_backward.h>
+#include <ATen/ops/softmax.h>
+#include <ATen/ops/softplus.h>
+#include <ATen/ops/softplus_backward.h>
+#include <ATen/ops/softshrink.h>
+#include <ATen/ops/softshrink_backward.h>
+#include <ATen/ops/sort.h>
+#include <ATen/ops/sparse_bsc_tensor.h>
+#include <ATen/ops/sparse_bsr_tensor.h>
+#include <ATen/ops/sparse_compressed_tensor.h>
+#include <ATen/ops/sparse_coo_tensor.h>
+#include <ATen/ops/sparse_csc_tensor.h>
+#include <ATen/ops/sparse_csr_tensor.h>
+#include <ATen/ops/sparse_dim.h>
+#include <ATen/ops/sparse_mask.h>
+#include <ATen/ops/sparse_resize.h>
+#include <ATen/ops/sparse_resize_and_clear.h>
+#include <ATen/ops/sparse_sampled_addmm.h>
+#include <ATen/ops/special_airy_ai.h>
+#include <ATen/ops/special_bessel_j0.h>
+#include <ATen/ops/special_bessel_j1.h>
+#include <ATen/ops/special_bessel_y0.h>
+#include <ATen/ops/special_bessel_y1.h>
+#include <ATen/ops/special_chebyshev_polynomial_t.h>
+#include <ATen/ops/special_chebyshev_polynomial_u.h>
+#include <ATen/ops/special_chebyshev_polynomial_v.h>
+#include <ATen/ops/special_chebyshev_polynomial_w.h>
+#include <ATen/ops/special_digamma.h>
+#include <ATen/ops/special_entr.h>
+#include <ATen/ops/special_erf.h>
+#include <ATen/ops/special_erfc.h>
+#include <ATen/ops/special_erfcx.h>
+#include <ATen/ops/special_erfinv.h>
+#include <ATen/ops/special_exp2.h>
+#include <ATen/ops/special_expit.h>
+#include <ATen/ops/special_expm1.h>
+#include <ATen/ops/special_gammainc.h>
+#include <ATen/ops/special_gammaincc.h>
+#include <ATen/ops/special_gammaln.h>
+#include <ATen/ops/special_hermite_polynomial_h.h>
+#include <ATen/ops/special_hermite_polynomial_he.h>
+#include <ATen/ops/special_i0.h>
+#include <ATen/ops/special_i0e.h>
+#include <ATen/ops/special_i1.h>
+#include <ATen/ops/special_i1e.h>
+#include <ATen/ops/special_laguerre_polynomial_l.h>
+#include <ATen/ops/special_legendre_polynomial_p.h>
+#include <ATen/ops/special_log1p.h>
+#include <ATen/ops/special_log_ndtr.h>
+#include <ATen/ops/special_log_softmax.h>
+#include <ATen/ops/special_logit.h>
+#include <ATen/ops/special_logsumexp.h>
+#include <ATen/ops/special_modified_bessel_i0.h>
+#include <ATen/ops/special_modified_bessel_i1.h>
+#include <ATen/ops/special_modified_bessel_k0.h>
+#include <ATen/ops/special_modified_bessel_k1.h>
+#include <ATen/ops/special_multigammaln.h>
+#include <ATen/ops/special_ndtr.h>
+#include <ATen/ops/special_ndtri.h>
+#include <ATen/ops/special_polygamma.h>
+#include <ATen/ops/special_psi.h>
+#include <ATen/ops/special_round.h>
+#include <ATen/ops/special_scaled_modified_bessel_k0.h>
+#include <ATen/ops/special_scaled_modified_bessel_k1.h>
+#include <ATen/ops/special_shifted_chebyshev_polynomial_t.h>
+#include <ATen/ops/special_shifted_chebyshev_polynomial_u.h>
+#include <ATen/ops/special_shifted_chebyshev_polynomial_v.h>
+#include <ATen/ops/special_shifted_chebyshev_polynomial_w.h>
+#include <ATen/ops/special_sinc.h>
+#include <ATen/ops/special_softmax.h>
+#include <ATen/ops/special_spherical_bessel_j0.h>
+#include <ATen/ops/special_xlog1py.h>
+#include <ATen/ops/special_xlogy.h>
+#include <ATen/ops/special_zeta.h>
+#include <ATen/ops/split.h>
+#include <ATen/ops/split_copy.h>
+#include <ATen/ops/split_with_sizes.h>
+#include <ATen/ops/split_with_sizes_copy.h>
+#include <ATen/ops/sqrt.h>
+#include <ATen/ops/square.h>
+#include <ATen/ops/squeeze.h>
+#include <ATen/ops/squeeze_copy.h>
+#include <ATen/ops/sspaddmm.h>
+#include <ATen/ops/stack.h>
+#include <ATen/ops/std.h>
+#include <ATen/ops/std_mean.h>
+#include <ATen/ops/stft.h>
+#include <ATen/ops/stride.h>
+#include <ATen/ops/sub.h>
+#include <ATen/ops/subtract.h>
+#include <ATen/ops/sum.h>
+#include <ATen/ops/sum_to_size.h>
+#include <ATen/ops/svd.h>
+#include <ATen/ops/swapaxes.h>
+#include <ATen/ops/swapdims.h>
+#include <ATen/ops/sym_constrain_range.h>
+#include <ATen/ops/sym_constrain_range_for_size.h>
+#include <ATen/ops/sym_is_contiguous.h>
+#include <ATen/ops/sym_numel.h>
+#include <ATen/ops/sym_size.h>
+#include <ATen/ops/sym_storage_offset.h>
+#include <ATen/ops/sym_stride.h>
+#include <ATen/ops/t.h>
+#include <ATen/ops/t_copy.h>
+#include <ATen/ops/take.h>
+#include <ATen/ops/take_along_dim.h>
+#include <ATen/ops/tan.h>
+#include <ATen/ops/tanh.h>
+#include <ATen/ops/tanh_backward.h>
+#include <ATen/ops/tensor_split.h>
+#include <ATen/ops/tensordot.h>
+#include <ATen/ops/thnn_conv2d.h>
+#include <ATen/ops/threshold.h>
+#include <ATen/ops/threshold_backward.h>
+#include <ATen/ops/tile.h>
+#include <ATen/ops/to.h>
+#include <ATen/ops/to_dense.h>
+#include <ATen/ops/to_dense_backward.h>
+#include <ATen/ops/to_mkldnn.h>
+#include <ATen/ops/to_mkldnn_backward.h>
+#include <ATen/ops/to_padded_tensor.h>
+#include <ATen/ops/to_sparse.h>
+#include <ATen/ops/to_sparse_bsc.h>
+#include <ATen/ops/to_sparse_bsr.h>
+#include <ATen/ops/to_sparse_csc.h>
+#include <ATen/ops/to_sparse_csr.h>
+#include <ATen/ops/topk.h>
+#include <ATen/ops/trace.h>
+#include <ATen/ops/trace_backward.h>
+#include <ATen/ops/transpose.h>
+#include <ATen/ops/transpose_copy.h>
+#include <ATen/ops/trapezoid.h>
+#include <ATen/ops/trapz.h>
+#include <ATen/ops/triangular_solve.h>
+#include <ATen/ops/tril.h>
+#include <ATen/ops/tril_indices.h>
+#include <ATen/ops/triplet_margin_loss.h>
+#include <ATen/ops/triu.h>
+#include <ATen/ops/triu_indices.h>
+#include <ATen/ops/true_divide.h>
+#include <ATen/ops/trunc.h>
+#include <ATen/ops/type_as.h>
+#include <ATen/ops/unbind.h>
+#include <ATen/ops/unbind_copy.h>
+#include <ATen/ops/unflatten.h>
+#include <ATen/ops/unflatten_dense_tensors.h>
+#include <ATen/ops/unfold.h>
+#include <ATen/ops/unfold_backward.h>
+#include <ATen/ops/unfold_copy.h>
+#include <ATen/ops/uniform.h>
+#include <ATen/ops/unique_consecutive.h>
+#include <ATen/ops/unique_dim.h>
+#include <ATen/ops/unique_dim_consecutive.h>
+#include <ATen/ops/unsafe_chunk.h>
+#include <ATen/ops/unsafe_split.h>
+#include <ATen/ops/unsafe_split_with_sizes.h>
+#include <ATen/ops/unsqueeze.h>
+#include <ATen/ops/unsqueeze_copy.h>
+#include <ATen/ops/upsample_bicubic2d.h>
+#include <ATen/ops/upsample_bicubic2d_backward.h>
+#include <ATen/ops/upsample_bilinear2d.h>
+#include <ATen/ops/upsample_bilinear2d_backward.h>
+#include <ATen/ops/upsample_linear1d.h>
+#include <ATen/ops/upsample_linear1d_backward.h>
+#include <ATen/ops/upsample_nearest1d.h>
+#include <ATen/ops/upsample_nearest1d_backward.h>
+#include <ATen/ops/upsample_nearest2d.h>
+#include <ATen/ops/upsample_nearest2d_backward.h>
+#include <ATen/ops/upsample_nearest3d.h>
+#include <ATen/ops/upsample_nearest3d_backward.h>
+#include <ATen/ops/upsample_trilinear3d.h>
+#include <ATen/ops/upsample_trilinear3d_backward.h>
+#include <ATen/ops/value_selecting_reduction_backward.h>
+#include <ATen/ops/values.h>
+#include <ATen/ops/values_copy.h>
+#include <ATen/ops/vander.h>
+#include <ATen/ops/var.h>
+#include <ATen/ops/var_mean.h>
+#include <ATen/ops/vdot.h>
+#include <ATen/ops/view.h>
+#include <ATen/ops/view_as.h>
+#include <ATen/ops/view_as_complex.h>
+#include <ATen/ops/view_as_complex_copy.h>
+#include <ATen/ops/view_as_real.h>
+#include <ATen/ops/view_as_real_copy.h>
+#include <ATen/ops/view_copy.h>
+#include <ATen/ops/vsplit.h>
+#include <ATen/ops/vstack.h>
+#include <ATen/ops/where.h>
+#include <ATen/ops/xlogy.h>
+#include <ATen/ops/xor.h>
+#include <ATen/ops/zero.h>
+#include <ATen/ops/zeros.h>
+#include <ATen/ops/zeros_like.h>
+namespace at {
+// Special C++ only overloads for std()-like functions (See gh-40287)
+// These are needed because int -> bool conversion takes precedence over int -> IntArrayRef
+// So, for example std(0) would select the std(unbiased=False) overload
+inline Tensor var(const Tensor& self, int dim) {  // C++-only overload: `dim` is one dimension given as a plain int
+  return at::var(self, IntArrayRef{dim});  // explicit IntArrayRef keeps the int from being converted to bool
+}
+inline std::tuple<Tensor, Tensor> var_mean(const Tensor& self, int dim) {  // C++-only overload: `dim` is one dimension given as a plain int
+  return at::var_mean(self, IntArrayRef{dim});  // explicit IntArrayRef keeps the int from being converted to bool
+}
+inline Tensor std(const Tensor& self, int dim) {  // C++-only overload: `dim` is one dimension given as a plain int
+  return at::std(self, IntArrayRef{dim});  // explicit IntArrayRef keeps the int from being converted to bool
+}
+inline std::tuple<Tensor, Tensor> std_mean(const Tensor& self, int dim) {  // C++-only overload: `dim` is one dimension given as a plain int
+  return at::std_mean(self, IntArrayRef{dim});  // explicit IntArrayRef keeps the int from being converted to bool
+}
+// Free-function form of the member call: yields `tensor.numel()`.
+inline int64_t numel(const Tensor& tensor) {
+  const int64_t count = tensor.numel();
+  return count;
+}
+// Free-function form of the member call: yields `tensor.size(dim)`.
+inline int64_t size(const Tensor& tensor, int64_t dim) {
+  const int64_t extent = tensor.size(dim);
+  return extent;
+}
+// Free-function form of the member call: yields `tensor.stride(dim)`.
+inline int64_t stride(const Tensor& tensor, int64_t dim) {
+  const int64_t step = tensor.stride(dim);
+  return step;
+}
+// Free-function form of the member call: yields `tensor.is_complex()`.
+inline bool is_complex(const Tensor& tensor) {
+  const bool answer = tensor.is_complex();
+  return answer;
+}
+// Free-function form of the member call: yields `tensor.is_floating_point()`.
+inline bool is_floating_point(const Tensor& tensor) {
+  const bool answer = tensor.is_floating_point();
+  return answer;
+}
+// Free-function form of the member call: yields `tensor.is_signed()`.
+inline bool is_signed(const Tensor& tensor) {
+  const bool answer = tensor.is_signed();
+  return answer;
+}
+// Free-function form of the member call: yields `tensor.is_inference()`.
+inline bool is_inference(const Tensor& tensor) {
+  const bool answer = tensor.is_inference();
+  return answer;
+}
+// Free-function form of the member call: yields `tensor._is_zerotensor()`.
+inline bool _is_zerotensor(const Tensor& tensor) {
+  const bool answer = tensor._is_zerotensor();
+  return answer;
+}
+// Free-function form of the member call: yields `tensor.is_conj()`.
+inline bool is_conj(const Tensor& tensor) {
+  const bool answer = tensor.is_conj();
+  return answer;
+}
+// Free-function form of the member call: yields `tensor.conj()`.
+inline Tensor conj(const Tensor& tensor) {
+  Tensor result = tensor.conj();
+  return result;
+}
+// Free-function form of the member call: yields `tensor.is_neg()`.
+inline bool is_neg(const Tensor& tensor) {
+  const bool answer = tensor.is_neg();
+  return answer;
+}
+}
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/InitialTensorOptions.h ADDED Viewed

	@@ -0,0 +1,20 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/TensorOptions.h>
+namespace at {
+// Represents the initial TensorOptions, before the "defaults" are ever changed.
+// This is designed to be used in library code, where the explicit devices,
+// dtypes, etc. are known. NOTE: this is not a stable API.
+inline TensorOptions initialTensorOptions() {  // takes no arguments; builds the options value on every call
+  return TensorOptions(kCPU).dtype(kFloat).layout(kStrided).requires_grad(  // kCPU device, kFloat dtype, kStrided layout
+      false);  // requires_grad is explicitly set to false
+}
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/LegacyBatchedTensorImpl.h ADDED Viewed

	@@ -0,0 +1,166 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <bitset>
+#include <ATen/ArrayRef.h>
+#include <ATen/SmallVector.h>
+#include <ATen/Tensor.h>
+namespace at {
+// We assume this in a few other places in the codebase,
+// but there isn't a centralized definition.
+constexpr int64_t kVmapMaxTensorDims = 64;  // width of the bitset returned by createBatchDimBitset
+// The valid vmap levels range from [0, 64). This effectively means that we
+// support a maximum of 64 nested vmaps.
+constexpr int64_t kVmapNumLevels = 64;  // width of the bitset returned by createVmapLevelsBitset
+// Store this number of elements of BatchDims on the stack. Most people will
+// probably use <= 5 nested vmaps, but adjust this number as necessary.
+constexpr int64_t kBatchDimsStackSize = 5;  // second template argument of the BatchDims SmallVector
+// a BatchDim represents a "private" dimension on a Tensor created inside of
+// vmap. It is a (level, dim) tuple, with the `dim` indicating which dimension
+// is being vmap'ed over and the `level` being an identifier for which vmap
+// said dimension was created inside. The `dim` corresponds to a "physical
+// dim" - it is a dimension index on the underlying physical tensor that is
+// being vmapped over.
+struct BatchDim {  // immutable (level, dim) pair; both fields are set once in the constructor
+  BatchDim(int64_t level, int64_t dim) : dim_(dim), level_(level) {}  // NB: parameters are (level, dim), the reverse of the member declaration order
+  int64_t dim() const {  // the stored `dim` value
+    return dim_;
+  }
+  int64_t level() const {  // the stored `level` value
+    return level_;
+  }
+ private:
+  int64_t dim_;  // see the struct comment: a dimension index on the physical tensor
+  int64_t level_;  // see the struct comment: identifies which vmap created the dimension
+};
+using BatchDims = SmallVector<BatchDim, kBatchDimsStackSize>;  // container type used to store a tensor's batch dims
+using BatchDimsRef = ArrayRef<BatchDim>;  // ArrayRef over BatchDim; used for parameters and accessor return values
+// A BatchedTensorImpl holds an underlying Tensor and a list of BatchDim
+// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a
+// BatchedTensorImpl.
+//
+// The batch dimensions are treated as being "private"; they are not
+// user-visible. For example, in the following Tensor,
+//    bt = BatchedTensorImpl(ones(2, 3, 5, 7), [(lvl=1, dim=0), (lvl=2, dim=1)])
+// dimensions 0 and 1 are batch dimensions.
+//
+// bt.sizes() returns (5, 7); bt.sum(0) performs a reduction over the (public)
+// dim 0, which is equivalent to dim 2 in the underlying ones(2, 3, 5, 7)
+// tensor.
+struct TORCH_API BatchedTensorImpl : public c10::TensorImpl {  // TensorImpl subclass pairing a Tensor with its batch dims
+  explicit BatchedTensorImpl(Tensor value, BatchDims bdims);  // both arguments are taken by value; defined out of line
+  // Returns a reference to BatchDims that represent which dimensions of this
+  // tensor are private.
+  BatchDimsRef bdims() const {  // NOTE(review): result refers to bdims_, so presumably valid only while this impl is alive
+    return bdims_;
+  }
+  // BatchedTensorImpl wraps a Tensor
+  const Tensor& value() const {  // the wrapped tensor, returned by const reference (no copy)
+    return value_;
+  }
+  // Given a public dimension index, return the dimension index in the
+  // underlying value() tensor. For example, if we have
+  //    bt = BatchedTensorImpl(ones(2, 3, 5, 7), [(lvl=1, dim=0), (lvl=2,
+  //    dim=2)])
+  // bt.actualDim(0) -> 1
+  // bt.actualDim(1) -> 3
+  // bt.actualDim(2) -> Error
+  int64_t actualDim(int64_t dim, bool wrap_dim = true) const;  // NOTE(review): wrap_dim presumably enables negative-index wrapping — confirm in the .cpp
+  // We have to override this because we opted into CustomStrides
+  IntArrayRef strides_custom() const override;
+  // Override a bunch of methods inherited from TensorImpl to return error
+  // messages.
+  c10::SymBool sym_is_contiguous_custom(
+      at::MemoryFormat memory_format) const override;
+  void set_size(int64_t dim, int64_t new_size) override;
+  void set_stride(int64_t dim, int64_t new_stride) override;
+  void set_storage_offset(int64_t storage_offset) override;
+#ifdef DEBUG
+  bool has_storage() const override;  // only overridden when DEBUG is defined
+#endif
+ private:
+  // see NOTE: [BatchedTensorImpl levels invariant]
+  void checkInvariants() const;
+  const char* tensorimpl_type_name() const override;
+  Tensor value_;  // the tensor returned by value()
+  // Note: [BatchedTensorImpl levels invariant]
+  // There is an invariant that the BatchDims must be stored in increasing
+  // `level` order. That is, for i < j, bdims_[i].level must be less than
+  // bdims_[j].level.
+  BatchDims bdims_;  // the batch dims exposed through bdims()
+};
+// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a
+// BatchedTensorImpl.
+inline bool isBatchedTensor(const Tensor& tensor) {  // true iff the tensor's key set contains DispatchKey::Batched
+  return tensor.unsafeGetTensorImpl()->key_set().has(DispatchKey::Batched);  // decided from the dispatch key set, not from the impl's dynamic type
+}
+// It is unsafe to call this on a Tensor that is not backed by a
+// BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible.
+inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) {  // unchecked downcast of the tensor's impl pointer
+  return static_cast<BatchedTensorImpl*>(tensor.unsafeGetTensorImpl());  // static_cast: no runtime type check is performed here
+}
+inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) {  // checked variant of unsafeGetBatchedImpl
+  if (!isBatchedTensor(tensor)) {  // not a BatchedTensor: signal that with a null pointer
+    return nullptr;
+  }
+  return unsafeGetBatchedImpl(tensor);  // the cast is guarded by the isBatchedTensor check above
+}
+// Returns a bitset. If bit i is set, then that means dim i is a batchdim.
+inline std::bitset<kVmapMaxTensorDims> createBatchDimBitset(
+    BatchDimsRef bdims) {
+  std::bitset<kVmapMaxTensorDims> is_bdim;  // default-constructed: all bits start cleared
+  for (const auto& bdim : bdims) {
+    is_bdim.set(bdim.dim());  // std::bitset::set throws std::out_of_range if the position is not below kVmapMaxTensorDims
+  }
+  return is_bdim;
+}
+// Creates a bitset for all of the levels present in `bdims`
+inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(BatchDimsRef bdims) {  // bit i is set iff some entry of `bdims` has level i
+  std::bitset<kVmapNumLevels> result;  // default-constructed: all bits start cleared
+  for (const auto& bdim : bdims) {
+    result.set(bdim.level());  // std::bitset::set throws std::out_of_range if the position is not below kVmapNumLevels
+  }
+  return result;
+}
+inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) {  // streams a BatchDim as "(lvl=<level>, dim=<dim>)"
+  out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')';  // level is printed first, then dim
+  return out;  // return the stream so insertions can be chained
+}
+// Use this to construct a BatchedTensor from a regular Tensor
+TORCH_API Tensor makeBatched(Tensor tensor, BatchDims bdims);
+// Adds a batch dim to `tensor`, returning a BatchedTensor
+TORCH_API Tensor addBatchDim(Tensor tensor, int64_t level, int64_t dim);
+// Checks if an inplace operation on self and other is "vmap compatible".
+// See NOTE: [vmap-incompatible in-place operations] for the definition of this.
+TORCH_API bool inplaceIsVmapCompatible(const Tensor& self, const Tensor& other);
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/LegacyVmapMode.h ADDED Viewed

	@@ -0,0 +1,31 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/impl/LocalDispatchKeySet.h>
+namespace at::impl {
+// VmapMode contains a thread local count of how many nested vmaps
+// we are currently inside. That number is known as the `vmap level`.
+// VmapMode is used in the implementation of the Python `torch.vmap` API.
+//
+// NOTE: this is NOT the c++ api for torch.vmap. That doesn't exist yet.
+struct TORCH_API VmapMode {  // only static member functions are declared here; definitions live out of line
+  // Returns the vmap level, aka the count of how many nested vmaps we're in.
+  static int64_t current_vmap_level();
+  // Increments the count of nested vmaps. If this causes the vmap level to be
+  // greater than 0, then it enables DispatchKey::VmapMode on all tensors.
+  static int64_t increment_nesting();  // NOTE(review): presumably returns the updated level — confirm in the .cpp
+  // Decrements the count of nested vmaps. If this causes the vmap level to be
+  // equal to 0, then it disables DispatchKey::VmapMode on all tensors.
+  static int64_t decrement_nesting();  // NOTE(review): presumably returns the updated level — confirm in the .cpp
+};
+} // namespace at::impl
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/LegacyVmapTransforms.h ADDED Viewed

	@@ -0,0 +1,188 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/LegacyBatchedTensorImpl.h>
+#include <ATen/core/IListRef.h>
+namespace at {
+// This file contains abstractions used for transforming *logical* vmap
+// arguments into *physical* arguments. (Keep reading for definitions of these
+// terms).
+// NOTE: [Logical vs physical args]
+// Consider the following vmap.
+//   vmap(vmap(func, in_dims=(2,)), in_dims=(0,))(torch.ones(2, 3, 4))
+// This would produce a BatchedTensor wrapping a Tensor of size [2, 3, 4],
+// with batch dims 0 and 2:
+//   BatchedTensor(ones(2, 3, 4), bdims=[(lvl=1,dim=0),(lvl=2,dim=2)])
+//
+// We say the *logical* view of the tensor has size [3] -- tensors inside
+// `func` appear to have size [3].
+// However, the *physical* underlying tensor (the one passed to vmap) has size
+// [2, 3, 4].
+//
+// This notion of logical vs physical also extends to non-tensor arguments.
+// Consider the previous tensor; let's assume the user called
+// `torch.sum(tensor, dim=0)` inside of `func`. Then the logical
+// dimension they are reducing over is dim 0 but the physical dim is dim 1
+// (the first non-batch dimension)
+// Forward declared; see NOTE: [What is a VmapPhysicalView?]
+struct VmapPhysicalView;
+// Most PyTorch operators take 4 or fewer inputs.
+constexpr int64_t kVmapTransformStaticInputSize = 4;  // second template argument of VmapPhysicalViewVec
+using VmapPhysicalViewVec =
+    SmallVector<VmapPhysicalView, kVmapTransformStaticInputSize>;  // return type of the list-taking logicalToPhysical overloads
+// PyTorch generally advertises good performance for <= 5 dims.
+// (see ATen/core/DimVector.h). We add a few extra dims (~3) for vmap
+// dimensions to get 8. Adjust this number as necessary
+constexpr int64_t kVmapStaticDimVecSize = 8;  // shared by VmapDimVector and VmapSymDimVector
+using VmapDimVector = SmallVector<int64_t, kVmapStaticDimVecSize>;  // dims/shapes held as int64_t
+using VmapSymDimVector = SmallVector<c10::SymInt, kVmapStaticDimVecSize>;  // same size parameter, but with c10::SymInt elements
+// NOTE: [What is an VmapTransform?]
+// A *VmapTransform* converts logical views of tensors to physical views.
+//
+// Batching rules use VmapTransforms to convert logical arguments to
+// physical arguments, then call one or more at:: operators that handle the
+// physical arguments, and then convert the physical result back to a logical
+// argument.
+// VmapTransform for operators that take tensors with multiple batch dims.
+// Given one or more logical views on Tensors, `logicalToPhysical`
+// permutes all of the batch dims to the front of the tensor, aligns
+// and expands the batch dims to match each other (according to their `level`),
+// and returns a VmapPhysicalView on the tensor(s).
+struct TORCH_API MultiBatchVmapTransform {  // holds only static functions; both are defined out of line
+  static VmapPhysicalView logicalToPhysical(const Tensor& logical_tensor);  // single-tensor overload: returns one physical view
+  static VmapPhysicalViewVec logicalToPhysical(ITensorListRef logical_tensors);  // list overload: returns a vector of physical views
+};
+// VmapTransform for operators that broadcast all inputs.
+// Given some logical views on Tensors, `logicalToPhysical`:
+// - permutes all of the batch dims to the front of the tensors
+// - aligns all the batch dims to the collective levels of all of the tensors.
+//   If a tensor does not have a batch dim for a vmap level, then it receives
+//   a size-one dimension for said level.
+// - aligns the non-batch dims to have the same dimensionality, adding extra
+//   size-1 dimensions in between the batch dimensions and the non-batch
+//   dimensions so that the batch dimensions are lined up from the right.
+//
+// For example: given inputs of size (B, 2) and (B, 3, 2) where B is the batch
+// dimension, BroadcastingVmapTransform returns VmapPhysicalViews that wrap
+// tensors of size (B, 1, 2) and (B, 3, 2).
+//
+// Given inputs of size (B, 2) and (2,), BroadcastingVmapTransform returns
+// VmapPhysicalViews wrapping tensors of size (B, 2) and (1, 2). We don't
+// actually *need* to return a tensor of size (1, 2) for the second tensor
+// because the broadcasting operation takes care of that for us, but we do
+// it anyways to keep things simple.
+struct TORCH_API BroadcastingVmapTransform {  // holds only a static function, defined out of line
+  static VmapPhysicalViewVec logicalToPhysical(TensorList logical_tensors);  // list-only; unlike MultiBatchVmapTransform there is no single-tensor overload
+};
+// Forward declared, if you're reading this file head to toe, don't worry about
+// it yet.
+struct VmapPhysicalToLogicalMap;
+// NOTE: [What is a VmapPhysicalView?]
+// VmapPhysicalView represents a physical view on a Tensor.
+//
+// One can use it to further convert logical dimension indices, logical shapes,
+// and more to their physical variants, or convert a new (physical) tensor into
+// a logical BatchedTensor. (TODO(rzou): some of these are not yet implemented).
+//
+// VmapPhysicalView stores a physical tensor with all of its batch dimensions at
+// the front and some levels that correspond to said batch dimensions.
+//
+// The levels bitset specifies which vmap levels correspond to the batch
+// dimensions at the front of the tensor. In particular, the number of set bits
+// corresponds to the number of batch dimensions on `tensor` and the rightmost
+// bit of `levels` specifies the maximum number of nested vmaps we are in at
+// this point in time.
+// For example, given:
+//   physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5, 6), levels={1, 3})
+//
+// The rightmost set bit of `levels` is 3, indicating that the number of
+// nested vmaps is less than or equal to 3.
+//   bitset: 010100
+//              ^
+//              |
+//   levels: 012345
+struct TORCH_API VmapPhysicalView {  // pairs a physical (non-batched) tensor with a bitset of vmap levels
+  VmapPhysicalView(Tensor&& tensor, std::bitset<kVmapNumLevels> levels)  // takes the tensor by rvalue reference and moves it into the view
+      : levels_(levels), tensor_(std::move(tensor)) {
+    TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor_));  // the wrapped tensor must not itself be a BatchedTensor
+  }
+  Tensor& tensor() {  // mutable access to the wrapped tensor
+    return tensor_;
+  }
+  const Tensor& tensor() const {  // read-only access to the wrapped tensor
+    return tensor_;
+  }
+  // Maps logical dim indices to physical dim indices. Also does dim wrapping.
+  //
+  // For example, given:
+  //   physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5), levels={1, 3})
+  //
+  // Then physical_view.getPhysicalDims({0, 1}) returns {2, 3}.
+  // This is because the size of levels tells us that the first two dimensions
+  // of `tensor_` are batch dimensions, so a logical dim of `n` is actually
+  // a physical dim of `n + 2`.
+  VmapDimVector getPhysicalDims(OptionalIntArrayRef logical_dims) const;  // NOTE(review): behavior for an empty optional is not visible here — confirm in the .cpp
+  int64_t getPhysicalDim(int64_t logical_dim) const;  // single-index counterpart of getPhysicalDims
+  // Returns a VmapPhysicalToLogicalMap object. This can be used for
+  // mapping a physical tensor to a new logical tensor (BatchedTensor)
+  VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const;
+  // Maps a logical shape to a physical shape by prepending the batch
+  // sizes to the logical shape.
+  VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const;
+  int64_t numBatchDims() const;  // NOTE(review): presumably the number of set bits in levels_ — confirm in the .cpp
+ private:
+  int64_t numLogicalDims() const;  // private helper, defined out of line
+  std::bitset<kVmapNumLevels> levels_;  // declared before tensor_, matching the constructor's initializer order
+  Tensor tensor_;  // the physical tensor handed to the constructor
+};
+// Convenience struct used for mapping a physical tensor (a non-BatchedTensor)
+// to a logical one (BatchedTensor). It holds some levels that are used to do
+// the mapping and assumes that the batch dimensions in the physical tensor all
+// occur at the front of the tensor.
+struct TORCH_API VmapPhysicalToLogicalMap {  // stores a levels bitset and maps physical tensors to BatchedTensors
+  VmapPhysicalToLogicalMap(std::bitset<kVmapNumLevels> levels)  // NOTE(review): not `explicit`, so a std::bitset<kVmapNumLevels> converts implicitly
+      : levels_(levels) {}
+  // Maps a physical tensor to a new logical tensor (BatchedTensor).
+  // Assumes that all of the "batch dimensions" are at the front
+  // of the physical tensor. For example, given:
+  // - x = rank-4 Tensor with size 2, 3, 5, 7
+  // - levels = (2, 4)
+  // Returns:
+  // - BatchedTensor(x, bdims=[(dim=0,lvl=2), (dim=1, lvl=4)])
+  Tensor apply(const Tensor& physical_tensor) const;  // takes its input by const reference and returns a new Tensor
+  // Given a vector of physical tensors,
+  // 1. maps each tensor to a new logical tensor. Assumes that all of the
+  //    "batch dimensions" are at the front of the physical tensors.
+  // 2. stores the new logical tensors back into the passed-in vector. This is
+  //    to avoid additional dynamic allocations.
+  void applyInplace(std::vector<Tensor>& physical_tensors) const;  // modifies the caller's vector through the non-const reference
+  std::bitset<kVmapNumLevels> levels_;  // public data member: no access specifier precedes it in this struct
+};
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/MethodOperators.h ADDED Viewed

	@@ -0,0 +1,449 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from MethodOperators.h
+#ifdef TORCH_ASSERT_NO_OPERATORS
+#error This change adds a dependency on native_functions.yaml,             \
+  meaning the file will need to be re-compiled every time an operator      \
+  is changed or added. Consider if your change would be better placed in   \
+  another file, or if a more specific header might achieve the same goal.  \
+  See NOTE: [Tensor vs. TensorBase]
+#endif
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+#include <ATen/ops/_addmm_activation_ops.h>
+#include <ATen/ops/_autocast_to_full_precision_ops.h>
+#include <ATen/ops/_autocast_to_reduced_precision_ops.h>
+#include <ATen/ops/_backward_ops.h>
+#include <ATen/ops/_coalesced_ops.h>
+#include <ATen/ops/_conj_ops.h>
+#include <ATen/ops/_conj_physical_ops.h>
+#include <ATen/ops/_dimI_ops.h>
+#include <ATen/ops/_dimV_ops.h>
+#include <ATen/ops/_fw_primal_ops.h>
+#include <ATen/ops/_indices_ops.h>
+#include <ATen/ops/_is_all_true_ops.h>
+#include <ATen/ops/_is_any_true_ops.h>
+#include <ATen/ops/_is_zerotensor_ops.h>
+#include <ATen/ops/_lazy_clone_ops.h>
+#include <ATen/ops/_neg_view_ops.h>
+#include <ATen/ops/_nested_tensor_size_ops.h>
+#include <ATen/ops/_nested_tensor_storage_offsets_ops.h>
+#include <ATen/ops/_nested_tensor_strides_ops.h>
+#include <ATen/ops/_nnz_ops.h>
+#include <ATen/ops/_reshape_alias_ops.h>
+#include <ATen/ops/_sparse_mask_projection_ops.h>
+#include <ATen/ops/_to_dense_ops.h>
+#include <ATen/ops/_to_sparse_bsc_ops.h>
+#include <ATen/ops/_to_sparse_bsr_ops.h>
+#include <ATen/ops/_to_sparse_csc_ops.h>
+#include <ATen/ops/_to_sparse_csr_ops.h>
+#include <ATen/ops/_to_sparse_ops.h>
+#include <ATen/ops/_values_ops.h>
+#include <ATen/ops/_version_ops.h>
+#include <ATen/ops/abs_ops.h>
+#include <ATen/ops/absolute_ops.h>
+#include <ATen/ops/acos_ops.h>
+#include <ATen/ops/acosh_ops.h>
+#include <ATen/ops/add_ops.h>
+#include <ATen/ops/addbmm_ops.h>
+#include <ATen/ops/addcdiv_ops.h>
+#include <ATen/ops/addcmul_ops.h>
+#include <ATen/ops/addmm_ops.h>
+#include <ATen/ops/addmv_ops.h>
+#include <ATen/ops/addr_ops.h>
+#include <ATen/ops/adjoint_ops.h>
+#include <ATen/ops/alias_ops.h>
+#include <ATen/ops/align_as_ops.h>
+#include <ATen/ops/align_to_ops.h>
+#include <ATen/ops/all_ops.h>
+#include <ATen/ops/allclose_ops.h>
+#include <ATen/ops/amax_ops.h>
+#include <ATen/ops/amin_ops.h>
+#include <ATen/ops/aminmax_ops.h>
+#include <ATen/ops/and_ops.h>
+#include <ATen/ops/angle_ops.h>
+#include <ATen/ops/any_ops.h>
+#include <ATen/ops/arccos_ops.h>
+#include <ATen/ops/arccosh_ops.h>
+#include <ATen/ops/arcsin_ops.h>
+#include <ATen/ops/arcsinh_ops.h>
+#include <ATen/ops/arctan2_ops.h>
+#include <ATen/ops/arctan_ops.h>
+#include <ATen/ops/arctanh_ops.h>
+#include <ATen/ops/argmax_ops.h>
+#include <ATen/ops/argmin_ops.h>
+#include <ATen/ops/argsort_ops.h>
+#include <ATen/ops/argwhere_ops.h>
+#include <ATen/ops/as_strided_ops.h>
+#include <ATen/ops/as_strided_scatter_ops.h>
+#include <ATen/ops/asin_ops.h>
+#include <ATen/ops/asinh_ops.h>
+#include <ATen/ops/atan2_ops.h>
+#include <ATen/ops/atan_ops.h>
+#include <ATen/ops/atanh_ops.h>
+#include <ATen/ops/baddbmm_ops.h>
+#include <ATen/ops/bernoulli_ops.h>
+#include <ATen/ops/bincount_ops.h>
+#include <ATen/ops/bitwise_and_ops.h>
+#include <ATen/ops/bitwise_left_shift_ops.h>
+#include <ATen/ops/bitwise_not_ops.h>
+#include <ATen/ops/bitwise_or_ops.h>
+#include <ATen/ops/bitwise_right_shift_ops.h>
+#include <ATen/ops/bitwise_xor_ops.h>
+#include <ATen/ops/bmm_ops.h>
+#include <ATen/ops/broadcast_to_ops.h>
+#include <ATen/ops/cauchy_ops.h>
+#include <ATen/ops/ccol_indices_ops.h>
+#include <ATen/ops/ceil_ops.h>
+#include <ATen/ops/chalf_ops.h>
+#include <ATen/ops/cholesky_inverse_ops.h>
+#include <ATen/ops/cholesky_ops.h>
+#include <ATen/ops/cholesky_solve_ops.h>
+#include <ATen/ops/chunk_ops.h>
+#include <ATen/ops/clamp_max_ops.h>
+#include <ATen/ops/clamp_min_ops.h>
+#include <ATen/ops/clamp_ops.h>
+#include <ATen/ops/clip_ops.h>
+#include <ATen/ops/clone_ops.h>
+#include <ATen/ops/coalesce_ops.h>
+#include <ATen/ops/col_indices_ops.h>
+#include <ATen/ops/conj_ops.h>
+#include <ATen/ops/conj_physical_ops.h>
+#include <ATen/ops/contiguous_ops.h>
+#include <ATen/ops/copy_ops.h>
+#include <ATen/ops/copysign_ops.h>
+#include <ATen/ops/corrcoef_ops.h>
+#include <ATen/ops/cos_ops.h>
+#include <ATen/ops/cosh_ops.h>
+#include <ATen/ops/count_nonzero_ops.h>
+#include <ATen/ops/cov_ops.h>
+#include <ATen/ops/cross_ops.h>
+#include <ATen/ops/crow_indices_ops.h>
+#include <ATen/ops/cummax_ops.h>
+#include <ATen/ops/cummin_ops.h>
+#include <ATen/ops/cumprod_ops.h>
+#include <ATen/ops/cumsum_ops.h>
+#include <ATen/ops/data_ops.h>
+#include <ATen/ops/deg2rad_ops.h>
+#include <ATen/ops/dense_dim_ops.h>
+#include <ATen/ops/dequantize_ops.h>
+#include <ATen/ops/det_ops.h>
+#include <ATen/ops/detach_ops.h>
+#include <ATen/ops/diag_embed_ops.h>
+#include <ATen/ops/diag_ops.h>
+#include <ATen/ops/diagflat_ops.h>
+#include <ATen/ops/diagonal_ops.h>
+#include <ATen/ops/diagonal_scatter_ops.h>
+#include <ATen/ops/diff_ops.h>
+#include <ATen/ops/digamma_ops.h>
+#include <ATen/ops/dist_ops.h>
+#include <ATen/ops/div_ops.h>
+#include <ATen/ops/divide_ops.h>
+#include <ATen/ops/dot_ops.h>
+#include <ATen/ops/dsplit_ops.h>
+#include <ATen/ops/eq_ops.h>
+#include <ATen/ops/equal_ops.h>
+#include <ATen/ops/erf_ops.h>
+#include <ATen/ops/erfc_ops.h>
+#include <ATen/ops/erfinv_ops.h>
+#include <ATen/ops/exp2_ops.h>
+#include <ATen/ops/exp_ops.h>
+#include <ATen/ops/expand_as_ops.h>
+#include <ATen/ops/expand_ops.h>
+#include <ATen/ops/expm1_ops.h>
+#include <ATen/ops/exponential_ops.h>
+#include <ATen/ops/fill_diagonal_ops.h>
+#include <ATen/ops/fill_ops.h>
+#include <ATen/ops/fix_ops.h>
+#include <ATen/ops/flatten_ops.h>
+#include <ATen/ops/flip_ops.h>
+#include <ATen/ops/fliplr_ops.h>
+#include <ATen/ops/flipud_ops.h>
+#include <ATen/ops/float_power_ops.h>
+#include <ATen/ops/floor_divide_ops.h>
+#include <ATen/ops/floor_ops.h>
+#include <ATen/ops/fmax_ops.h>
+#include <ATen/ops/fmin_ops.h>
+#include <ATen/ops/fmod_ops.h>
+#include <ATen/ops/frac_ops.h>
+#include <ATen/ops/frexp_ops.h>
+#include <ATen/ops/gather_ops.h>
+#include <ATen/ops/gcd_ops.h>
+#include <ATen/ops/ge_ops.h>
+#include <ATen/ops/geometric_ops.h>
+#include <ATen/ops/geqrf_ops.h>
+#include <ATen/ops/ger_ops.h>
+#include <ATen/ops/greater_equal_ops.h>
+#include <ATen/ops/greater_ops.h>
+#include <ATen/ops/gt_ops.h>
+#include <ATen/ops/hardshrink_backward_ops.h>
+#include <ATen/ops/hardshrink_ops.h>
+#include <ATen/ops/hash_tensor_ops.h>
+#include <ATen/ops/heaviside_ops.h>
+#include <ATen/ops/histc_ops.h>
+#include <ATen/ops/histogram_ops.h>
+#include <ATen/ops/hsplit_ops.h>
+#include <ATen/ops/hypot_ops.h>
+#include <ATen/ops/i0_ops.h>
+#include <ATen/ops/igamma_ops.h>
+#include <ATen/ops/igammac_ops.h>
+#include <ATen/ops/index_add_ops.h>
+#include <ATen/ops/index_copy_ops.h>
+#include <ATen/ops/index_fill_ops.h>
+#include <ATen/ops/index_ops.h>
+#include <ATen/ops/index_put_ops.h>
+#include <ATen/ops/index_reduce_ops.h>
+#include <ATen/ops/index_select_ops.h>
+#include <ATen/ops/indices_ops.h>
+#include <ATen/ops/inner_ops.h>
+#include <ATen/ops/int_repr_ops.h>
+#include <ATen/ops/inverse_ops.h>
+#include <ATen/ops/is_coalesced_ops.h>
+#include <ATen/ops/is_complex_ops.h>
+#include <ATen/ops/is_conj_ops.h>
+#include <ATen/ops/is_distributed_ops.h>
+#include <ATen/ops/is_floating_point_ops.h>
+#include <ATen/ops/is_inference_ops.h>
+#include <ATen/ops/is_leaf_ops.h>
+#include <ATen/ops/is_neg_ops.h>
+#include <ATen/ops/is_nonzero_ops.h>
+#include <ATen/ops/is_pinned_ops.h>
+#include <ATen/ops/is_same_size_ops.h>
+#include <ATen/ops/is_set_to_ops.h>
+#include <ATen/ops/is_signed_ops.h>
+#include <ATen/ops/isclose_ops.h>
+#include <ATen/ops/isfinite_ops.h>
+#include <ATen/ops/isinf_ops.h>
+#include <ATen/ops/isnan_ops.h>
+#include <ATen/ops/isneginf_ops.h>
+#include <ATen/ops/isposinf_ops.h>
+#include <ATen/ops/isreal_ops.h>
+#include <ATen/ops/istft_ops.h>
+#include <ATen/ops/item_ops.h>
+#include <ATen/ops/kron_ops.h>
+#include <ATen/ops/kthvalue_ops.h>
+#include <ATen/ops/lcm_ops.h>
+#include <ATen/ops/ldexp_ops.h>
+#include <ATen/ops/le_ops.h>
+#include <ATen/ops/lerp_ops.h>
+#include <ATen/ops/less_equal_ops.h>
+#include <ATen/ops/less_ops.h>
+#include <ATen/ops/lgamma_ops.h>
+#include <ATen/ops/log10_ops.h>
+#include <ATen/ops/log1p_ops.h>
+#include <ATen/ops/log2_ops.h>
+#include <ATen/ops/log_normal_ops.h>
+#include <ATen/ops/log_ops.h>
+#include <ATen/ops/log_softmax_ops.h>
+#include <ATen/ops/logaddexp2_ops.h>
+#include <ATen/ops/logaddexp_ops.h>
+#include <ATen/ops/logcumsumexp_ops.h>
+#include <ATen/ops/logdet_ops.h>
+#include <ATen/ops/logical_and_ops.h>
+#include <ATen/ops/logical_not_ops.h>
+#include <ATen/ops/logical_or_ops.h>
+#include <ATen/ops/logical_xor_ops.h>
+#include <ATen/ops/logit_ops.h>
+#include <ATen/ops/logsumexp_ops.h>
+#include <ATen/ops/lshift_ops.h>
+#include <ATen/ops/lt_ops.h>
+#include <ATen/ops/lu_solve_ops.h>
+#include <ATen/ops/mH_ops.h>
+#include <ATen/ops/mT_ops.h>
+#include <ATen/ops/masked_fill_ops.h>
+#include <ATen/ops/masked_scatter_ops.h>
+#include <ATen/ops/masked_select_ops.h>
+#include <ATen/ops/matmul_ops.h>
+#include <ATen/ops/matrix_H_ops.h>
+#include <ATen/ops/matrix_exp_ops.h>
+#include <ATen/ops/matrix_power_ops.h>
+#include <ATen/ops/max_ops.h>
+#include <ATen/ops/maximum_ops.h>
+#include <ATen/ops/mean_ops.h>
+#include <ATen/ops/median_ops.h>
+#include <ATen/ops/min_ops.h>
+#include <ATen/ops/minimum_ops.h>
+#include <ATen/ops/mm_ops.h>
+#include <ATen/ops/mode_ops.h>
+#include <ATen/ops/moveaxis_ops.h>
+#include <ATen/ops/movedim_ops.h>
+#include <ATen/ops/msort_ops.h>
+#include <ATen/ops/mul_ops.h>
+#include <ATen/ops/multinomial_ops.h>
+#include <ATen/ops/multiply_ops.h>
+#include <ATen/ops/mv_ops.h>
+#include <ATen/ops/mvlgamma_ops.h>
+#include <ATen/ops/nan_to_num_ops.h>
+#include <ATen/ops/nanmean_ops.h>
+#include <ATen/ops/nanmedian_ops.h>
+#include <ATen/ops/nanquantile_ops.h>
+#include <ATen/ops/nansum_ops.h>
+#include <ATen/ops/narrow_copy_ops.h>
+#include <ATen/ops/narrow_ops.h>
+#include <ATen/ops/ne_ops.h>
+#include <ATen/ops/neg_ops.h>
+#include <ATen/ops/negative_ops.h>
+#include <ATen/ops/new_empty_ops.h>
+#include <ATen/ops/new_empty_strided_ops.h>
+#include <ATen/ops/new_full_ops.h>
+#include <ATen/ops/new_ones_ops.h>
+#include <ATen/ops/new_zeros_ops.h>
+#include <ATen/ops/nextafter_ops.h>
+#include <ATen/ops/nonzero_numpy_ops.h>
+#include <ATen/ops/nonzero_ops.h>
+#include <ATen/ops/nonzero_static_ops.h>
+#include <ATen/ops/norm_ops.h>
+#include <ATen/ops/normal_ops.h>
+#include <ATen/ops/not_equal_ops.h>
+#include <ATen/ops/numpy_T_ops.h>
+#include <ATen/ops/or_ops.h>
+#include <ATen/ops/orgqr_ops.h>
+#include <ATen/ops/ormqr_ops.h>
+#include <ATen/ops/outer_ops.h>
+#include <ATen/ops/output_nr_ops.h>
+#include <ATen/ops/permute_ops.h>
+#include <ATen/ops/pin_memory_ops.h>
+#include <ATen/ops/pinverse_ops.h>
+#include <ATen/ops/polygamma_ops.h>
+#include <ATen/ops/positive_ops.h>
+#include <ATen/ops/pow_ops.h>
+#include <ATen/ops/prelu_ops.h>
+#include <ATen/ops/prod_ops.h>
+#include <ATen/ops/put_ops.h>
+#include <ATen/ops/q_per_channel_axis_ops.h>
+#include <ATen/ops/q_per_channel_scales_ops.h>
+#include <ATen/ops/q_per_channel_zero_points_ops.h>
+#include <ATen/ops/q_scale_ops.h>
+#include <ATen/ops/q_zero_point_ops.h>
+#include <ATen/ops/qr_ops.h>
+#include <ATen/ops/qscheme_ops.h>
+#include <ATen/ops/quantile_ops.h>
+#include <ATen/ops/rad2deg_ops.h>
+#include <ATen/ops/random_ops.h>
+#include <ATen/ops/ravel_ops.h>
+#include <ATen/ops/reciprocal_ops.h>
+#include <ATen/ops/record_stream_ops.h>
+#include <ATen/ops/refine_names_ops.h>
+#include <ATen/ops/relu_ops.h>
+#include <ATen/ops/remainder_ops.h>
+#include <ATen/ops/rename_ops.h>
+#include <ATen/ops/renorm_ops.h>
+#include <ATen/ops/repeat_interleave_ops.h>
+#include <ATen/ops/repeat_ops.h>
+#include <ATen/ops/requires_grad_ops.h>
+#include <ATen/ops/reshape_as_ops.h>
+#include <ATen/ops/reshape_ops.h>
+#include <ATen/ops/resize_as_ops.h>
+#include <ATen/ops/resize_as_sparse_ops.h>
+#include <ATen/ops/resize_ops.h>
+#include <ATen/ops/resolve_conj_ops.h>
+#include <ATen/ops/resolve_neg_ops.h>
+#include <ATen/ops/retain_grad_ops.h>
+#include <ATen/ops/retains_grad_ops.h>
+#include <ATen/ops/roll_ops.h>
+#include <ATen/ops/rot90_ops.h>
+#include <ATen/ops/round_ops.h>
+#include <ATen/ops/row_indices_ops.h>
+#include <ATen/ops/rshift_ops.h>
+#include <ATen/ops/rsqrt_ops.h>
+#include <ATen/ops/scatter_add_ops.h>
+#include <ATen/ops/scatter_ops.h>
+#include <ATen/ops/scatter_reduce_ops.h>
+#include <ATen/ops/select_ops.h>
+#include <ATen/ops/select_scatter_ops.h>
+#include <ATen/ops/set_data_ops.h>
+#include <ATen/ops/set_ops.h>
+#include <ATen/ops/sgn_ops.h>
+#include <ATen/ops/sigmoid_ops.h>
+#include <ATen/ops/sign_ops.h>
+#include <ATen/ops/signbit_ops.h>
+#include <ATen/ops/sin_ops.h>
+#include <ATen/ops/sinc_ops.h>
+#include <ATen/ops/sinh_ops.h>
+#include <ATen/ops/size_ops.h>
+#include <ATen/ops/slice_inverse_ops.h>
+#include <ATen/ops/slice_ops.h>
+#include <ATen/ops/slice_scatter_ops.h>
+#include <ATen/ops/slogdet_ops.h>
+#include <ATen/ops/smm_ops.h>
+#include <ATen/ops/softmax_ops.h>
+#include <ATen/ops/sort_ops.h>
+#include <ATen/ops/sparse_dim_ops.h>
+#include <ATen/ops/sparse_mask_ops.h>
+#include <ATen/ops/sparse_resize_and_clear_ops.h>
+#include <ATen/ops/sparse_resize_ops.h>
+#include <ATen/ops/split_ops.h>
+#include <ATen/ops/split_with_sizes_ops.h>
+#include <ATen/ops/sqrt_ops.h>
+#include <ATen/ops/square_ops.h>
+#include <ATen/ops/squeeze_ops.h>
+#include <ATen/ops/sspaddmm_ops.h>
+#include <ATen/ops/std_ops.h>
+#include <ATen/ops/stft_ops.h>
+#include <ATen/ops/stride_ops.h>
+#include <ATen/ops/sub_ops.h>
+#include <ATen/ops/subtract_ops.h>
+#include <ATen/ops/sum_ops.h>
+#include <ATen/ops/sum_to_size_ops.h>
+#include <ATen/ops/svd_ops.h>
+#include <ATen/ops/swapaxes_ops.h>
+#include <ATen/ops/swapdims_ops.h>
+#include <ATen/ops/t_ops.h>
+#include <ATen/ops/take_along_dim_ops.h>
+#include <ATen/ops/take_ops.h>
+#include <ATen/ops/tan_ops.h>
+#include <ATen/ops/tanh_ops.h>
+#include <ATen/ops/tensor_split_ops.h>
+#include <ATen/ops/tile_ops.h>
+#include <ATen/ops/to_dense_ops.h>
+#include <ATen/ops/to_mkldnn_ops.h>
+#include <ATen/ops/to_ops.h>
+#include <ATen/ops/to_padded_tensor_ops.h>
+#include <ATen/ops/to_sparse_bsc_ops.h>
+#include <ATen/ops/to_sparse_bsr_ops.h>
+#include <ATen/ops/to_sparse_csc_ops.h>
+#include <ATen/ops/to_sparse_csr_ops.h>
+#include <ATen/ops/to_sparse_ops.h>
+#include <ATen/ops/topk_ops.h>
+#include <ATen/ops/trace_ops.h>
+#include <ATen/ops/transpose_ops.h>
+#include <ATen/ops/triangular_solve_ops.h>
+#include <ATen/ops/tril_ops.h>
+#include <ATen/ops/triu_ops.h>
+#include <ATen/ops/true_divide_ops.h>
+#include <ATen/ops/trunc_ops.h>
+#include <ATen/ops/type_as_ops.h>
+#include <ATen/ops/unbind_ops.h>
+#include <ATen/ops/unflatten_ops.h>
+#include <ATen/ops/unfold_ops.h>
+#include <ATen/ops/uniform_ops.h>
+#include <ATen/ops/unsafe_chunk_ops.h>
+#include <ATen/ops/unsafe_split_ops.h>
+#include <ATen/ops/unsafe_split_with_sizes_ops.h>
+#include <ATen/ops/unsqueeze_ops.h>
+#include <ATen/ops/values_ops.h>
+#include <ATen/ops/var_ops.h>
+#include <ATen/ops/vdot_ops.h>
+#include <ATen/ops/view_as_ops.h>
+#include <ATen/ops/view_ops.h>
+#include <ATen/ops/vsplit_ops.h>
+#include <ATen/ops/where_ops.h>
+#include <ATen/ops/xlogy_ops.h>
+#include <ATen/ops/xor_ops.h>
+#include <ATen/ops/zero_ops.h>
+namespace at {
+namespace _ops {
+} // namespace _ops
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/NamedTensor.h ADDED Viewed

	@@ -0,0 +1,6 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#include <ATen/core/NamedTensor.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/NativeMetaFunctions.h ADDED Viewed

	@@ -0,0 +1,1352 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from NativeMetaFunctions.h
+#include <ATen/core/Tensor.h>
+#include <ATen/core/IListRef.h>
+#include <ATen/TensorMeta.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/ops/_adaptive_avg_pool2d_meta.h>
+#include <ATen/ops/_adaptive_avg_pool2d_backward_meta.h>
+#include <ATen/ops/_adaptive_avg_pool3d_meta.h>
+#include <ATen/ops/_adaptive_avg_pool3d_backward_meta.h>
+#include <ATen/ops/_add_batch_dim_meta.h>
+#include <ATen/ops/_add_relu_meta.h>
+#include <ATen/ops/_addmm_activation_meta.h>
+#include <ATen/ops/_aminmax_meta.h>
+#include <ATen/ops/_amp_foreach_non_finite_check_and_unscale_meta.h>
+#include <ATen/ops/_amp_update_scale_meta.h>
+#include <ATen/ops/_assert_async_meta.h>
+#include <ATen/ops/_assert_scalar_meta.h>
+#include <ATen/ops/_assert_tensor_metadata_meta.h>
+#include <ATen/ops/_autocast_to_full_precision_meta.h>
+#include <ATen/ops/_autocast_to_reduced_precision_meta.h>
+#include <ATen/ops/_backward_meta.h>
+#include <ATen/ops/_batch_norm_impl_index_meta.h>
+#include <ATen/ops/_batch_norm_impl_index_backward_meta.h>
+#include <ATen/ops/_batch_norm_no_update_meta.h>
+#include <ATen/ops/_batch_norm_with_update_meta.h>
+#include <ATen/ops/_cast_Byte_meta.h>
+#include <ATen/ops/_cast_Char_meta.h>
+#include <ATen/ops/_cast_Double_meta.h>
+#include <ATen/ops/_cast_Float_meta.h>
+#include <ATen/ops/_cast_Half_meta.h>
+#include <ATen/ops/_cast_Int_meta.h>
+#include <ATen/ops/_cast_Long_meta.h>
+#include <ATen/ops/_cast_Short_meta.h>
+#include <ATen/ops/_cdist_backward_meta.h>
+#include <ATen/ops/_cdist_forward_meta.h>
+#include <ATen/ops/_cholesky_solve_helper_meta.h>
+#include <ATen/ops/_choose_qparams_per_tensor_meta.h>
+#include <ATen/ops/_chunk_cat_meta.h>
+#include <ATen/ops/_coalesce_meta.h>
+#include <ATen/ops/_coalesced_meta.h>
+#include <ATen/ops/_compute_linear_combination_meta.h>
+#include <ATen/ops/_conj_meta.h>
+#include <ATen/ops/_conj_copy_meta.h>
+#include <ATen/ops/_conj_physical_meta.h>
+#include <ATen/ops/_conv_depthwise2d_meta.h>
+#include <ATen/ops/_convert_indices_from_coo_to_csr_meta.h>
+#include <ATen/ops/_convert_indices_from_csr_to_coo_meta.h>
+#include <ATen/ops/_convert_weight_to_int4pack_meta.h>
+#include <ATen/ops/_convert_weight_to_int4pack_for_cpu_meta.h>
+#include <ATen/ops/_convolution_meta.h>
+#include <ATen/ops/_convolution_double_backward_meta.h>
+#include <ATen/ops/_convolution_mode_meta.h>
+#include <ATen/ops/_copy_from_meta.h>
+#include <ATen/ops/_copy_from_and_resize_meta.h>
+#include <ATen/ops/_cslt_compress_meta.h>
+#include <ATen/ops/_cslt_sparse_mm_meta.h>
+#include <ATen/ops/_cslt_sparse_mm_search_meta.h>
+#include <ATen/ops/_ctc_loss_meta.h>
+#include <ATen/ops/_ctc_loss_backward_meta.h>
+#include <ATen/ops/_cudnn_attention_backward_meta.h>
+#include <ATen/ops/_cudnn_attention_forward_meta.h>
+#include <ATen/ops/_cudnn_ctc_loss_meta.h>
+#include <ATen/ops/_cudnn_init_dropout_state_meta.h>
+#include <ATen/ops/_cudnn_rnn_meta.h>
+#include <ATen/ops/_cudnn_rnn_backward_meta.h>
+#include <ATen/ops/_cudnn_rnn_flatten_weight_meta.h>
+#include <ATen/ops/_cufft_clear_plan_cache_meta.h>
+#include <ATen/ops/_cufft_get_plan_cache_max_size_meta.h>
+#include <ATen/ops/_cufft_get_plan_cache_size_meta.h>
+#include <ATen/ops/_cufft_set_plan_cache_max_size_meta.h>
+#include <ATen/ops/_cummax_helper_meta.h>
+#include <ATen/ops/_cummin_helper_meta.h>
+#include <ATen/ops/_debug_has_internal_overlap_meta.h>
+#include <ATen/ops/_dimI_meta.h>
+#include <ATen/ops/_dimV_meta.h>
+#include <ATen/ops/_dim_arange_meta.h>
+#include <ATen/ops/_dirichlet_grad_meta.h>
+#include <ATen/ops/_dyn_quant_matmul_4bit_meta.h>
+#include <ATen/ops/_dyn_quant_pack_4bit_weight_meta.h>
+#include <ATen/ops/_efficient_attention_backward_meta.h>
+#include <ATen/ops/_efficient_attention_forward_meta.h>
+#include <ATen/ops/_efficientzerotensor_meta.h>
+#include <ATen/ops/_embedding_bag_meta.h>
+#include <ATen/ops/_embedding_bag_backward_meta.h>
+#include <ATen/ops/_embedding_bag_dense_backward_meta.h>
+#include <ATen/ops/_embedding_bag_forward_only_meta.h>
+#include <ATen/ops/_embedding_bag_per_sample_weights_backward_meta.h>
+#include <ATen/ops/_embedding_bag_sparse_backward_meta.h>
+#include <ATen/ops/_empty_affine_quantized_meta.h>
+#include <ATen/ops/_empty_per_channel_affine_quantized_meta.h>
+#include <ATen/ops/_euclidean_dist_meta.h>
+#include <ATen/ops/_fake_quantize_learnable_per_channel_affine_meta.h>
+#include <ATen/ops/_fake_quantize_learnable_per_channel_affine_backward_meta.h>
+#include <ATen/ops/_fake_quantize_learnable_per_tensor_affine_meta.h>
+#include <ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward_meta.h>
+#include <ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_meta.h>
+#include <ATen/ops/_fft_c2c_meta.h>
+#include <ATen/ops/_fft_c2r_meta.h>
+#include <ATen/ops/_fft_r2c_meta.h>
+#include <ATen/ops/_fill_mem_eff_dropout_mask_meta.h>
+#include <ATen/ops/_flash_attention_backward_meta.h>
+#include <ATen/ops/_flash_attention_forward_meta.h>
+#include <ATen/ops/_foobar_meta.h>
+#include <ATen/ops/_foreach_abs_meta.h>
+#include <ATen/ops/_foreach_acos_meta.h>
+#include <ATen/ops/_foreach_add_meta.h>
+#include <ATen/ops/_foreach_addcdiv_meta.h>
+#include <ATen/ops/_foreach_addcmul_meta.h>
+#include <ATen/ops/_foreach_asin_meta.h>
+#include <ATen/ops/_foreach_atan_meta.h>
+#include <ATen/ops/_foreach_ceil_meta.h>
+#include <ATen/ops/_foreach_clamp_max_meta.h>
+#include <ATen/ops/_foreach_clamp_min_meta.h>
+#include <ATen/ops/_foreach_copy_meta.h>
+#include <ATen/ops/_foreach_cos_meta.h>
+#include <ATen/ops/_foreach_cosh_meta.h>
+#include <ATen/ops/_foreach_div_meta.h>
+#include <ATen/ops/_foreach_erf_meta.h>
+#include <ATen/ops/_foreach_erfc_meta.h>
+#include <ATen/ops/_foreach_exp_meta.h>
+#include <ATen/ops/_foreach_expm1_meta.h>
+#include <ATen/ops/_foreach_floor_meta.h>
+#include <ATen/ops/_foreach_frac_meta.h>
+#include <ATen/ops/_foreach_lerp_meta.h>
+#include <ATen/ops/_foreach_lgamma_meta.h>
+#include <ATen/ops/_foreach_log_meta.h>
+#include <ATen/ops/_foreach_log10_meta.h>
+#include <ATen/ops/_foreach_log1p_meta.h>
+#include <ATen/ops/_foreach_log2_meta.h>
+#include <ATen/ops/_foreach_max_meta.h>
+#include <ATen/ops/_foreach_maximum_meta.h>
+#include <ATen/ops/_foreach_minimum_meta.h>
+#include <ATen/ops/_foreach_mul_meta.h>
+#include <ATen/ops/_foreach_neg_meta.h>
+#include <ATen/ops/_foreach_norm_meta.h>
+#include <ATen/ops/_foreach_pow_meta.h>
+#include <ATen/ops/_foreach_reciprocal_meta.h>
+#include <ATen/ops/_foreach_round_meta.h>
+#include <ATen/ops/_foreach_rsqrt_meta.h>
+#include <ATen/ops/_foreach_sigmoid_meta.h>
+#include <ATen/ops/_foreach_sign_meta.h>
+#include <ATen/ops/_foreach_sin_meta.h>
+#include <ATen/ops/_foreach_sinh_meta.h>
+#include <ATen/ops/_foreach_sqrt_meta.h>
+#include <ATen/ops/_foreach_sub_meta.h>
+#include <ATen/ops/_foreach_tan_meta.h>
+#include <ATen/ops/_foreach_tanh_meta.h>
+#include <ATen/ops/_foreach_trunc_meta.h>
+#include <ATen/ops/_foreach_zero_meta.h>
+#include <ATen/ops/_functional_assert_async_meta.h>
+#include <ATen/ops/_functional_assert_scalar_meta.h>
+#include <ATen/ops/_functional_sym_constrain_range_meta.h>
+#include <ATen/ops/_functional_sym_constrain_range_for_size_meta.h>
+#include <ATen/ops/_fused_adagrad_meta.h>
+#include <ATen/ops/_fused_adam_meta.h>
+#include <ATen/ops/_fused_adamw_meta.h>
+#include <ATen/ops/_fused_dropout_meta.h>
+#include <ATen/ops/_fused_moving_avg_obs_fq_helper_meta.h>
+#include <ATen/ops/_fused_rms_norm_meta.h>
+#include <ATen/ops/_fused_rms_norm_backward_meta.h>
+#include <ATen/ops/_fused_sdp_choice_meta.h>
+#include <ATen/ops/_fused_sgd_meta.h>
+#include <ATen/ops/_fw_primal_meta.h>
+#include <ATen/ops/_fw_primal_copy_meta.h>
+#include <ATen/ops/_gather_sparse_backward_meta.h>
+#include <ATen/ops/_grid_sampler_2d_cpu_fallback_meta.h>
+#include <ATen/ops/_grid_sampler_2d_cpu_fallback_backward_meta.h>
+#include <ATen/ops/_grouped_mm_meta.h>
+#include <ATen/ops/_has_compatible_shallow_copy_type_meta.h>
+#include <ATen/ops/_has_same_storage_numel_meta.h>
+#include <ATen/ops/_histogramdd_bin_edges_meta.h>
+#include <ATen/ops/_histogramdd_from_bin_cts_meta.h>
+#include <ATen/ops/_histogramdd_from_bin_tensors_meta.h>
+#include <ATen/ops/_index_put_impl_meta.h>
+#include <ATen/ops/_indices_meta.h>
+#include <ATen/ops/_indices_copy_meta.h>
+#include <ATen/ops/_int_mm_meta.h>
+#include <ATen/ops/_is_all_true_meta.h>
+#include <ATen/ops/_is_any_true_meta.h>
+#include <ATen/ops/_is_zerotensor_meta.h>
+#include <ATen/ops/_jagged_to_padded_dense_forward_meta.h>
+#include <ATen/ops/_lazy_clone_meta.h>
+#include <ATen/ops/_linalg_check_errors_meta.h>
+#include <ATen/ops/_linalg_det_meta.h>
+#include <ATen/ops/_linalg_eigh_meta.h>
+#include <ATen/ops/_linalg_eigvals_meta.h>
+#include <ATen/ops/_linalg_slogdet_meta.h>
+#include <ATen/ops/_linalg_solve_ex_meta.h>
+#include <ATen/ops/_linalg_svd_meta.h>
+#include <ATen/ops/_local_scalar_dense_meta.h>
+#include <ATen/ops/_log_softmax_meta.h>
+#include <ATen/ops/_log_softmax_backward_data_meta.h>
+#include <ATen/ops/_logcumsumexp_meta.h>
+#include <ATen/ops/_lstm_mps_meta.h>
+#include <ATen/ops/_lu_with_info_meta.h>
+#include <ATen/ops/_make_dep_token_meta.h>
+#include <ATen/ops/_make_dual_meta.h>
+#include <ATen/ops/_make_dual_copy_meta.h>
+#include <ATen/ops/_make_per_channel_quantized_tensor_meta.h>
+#include <ATen/ops/_make_per_tensor_quantized_tensor_meta.h>
+#include <ATen/ops/_masked_scale_meta.h>
+#include <ATen/ops/_masked_softmax_meta.h>
+#include <ATen/ops/_masked_softmax_backward_meta.h>
+#include <ATen/ops/_mixed_dtypes_linear_meta.h>
+#include <ATen/ops/_mkldnn_reshape_meta.h>
+#include <ATen/ops/_mkldnn_transpose_meta.h>
+#include <ATen/ops/_mps_convolution_meta.h>
+#include <ATen/ops/_mps_convolution_transpose_meta.h>
+#include <ATen/ops/_native_batch_norm_legit_meta.h>
+#include <ATen/ops/_native_batch_norm_legit_no_training_meta.h>
+#include <ATen/ops/_native_multi_head_attention_meta.h>
+#include <ATen/ops/_neg_view_meta.h>
+#include <ATen/ops/_neg_view_copy_meta.h>
+#include <ATen/ops/_nested_compute_contiguous_strides_offsets_meta.h>
+#include <ATen/ops/_nested_from_padded_meta.h>
+#include <ATen/ops/_nested_from_padded_and_nested_example_meta.h>
+#include <ATen/ops/_nested_from_padded_tensor_meta.h>
+#include <ATen/ops/_nested_get_jagged_dummy_meta.h>
+#include <ATen/ops/_nested_get_lengths_meta.h>
+#include <ATen/ops/_nested_get_max_seqlen_meta.h>
+#include <ATen/ops/_nested_get_min_seqlen_meta.h>
+#include <ATen/ops/_nested_get_offsets_meta.h>
+#include <ATen/ops/_nested_get_ragged_idx_meta.h>
+#include <ATen/ops/_nested_get_values_meta.h>
+#include <ATen/ops/_nested_get_values_copy_meta.h>
+#include <ATen/ops/_nested_select_backward_meta.h>
+#include <ATen/ops/_nested_sum_backward_meta.h>
+#include <ATen/ops/_nested_tensor_from_mask_meta.h>
+#include <ATen/ops/_nested_tensor_from_mask_left_aligned_meta.h>
+#include <ATen/ops/_nested_tensor_from_tensor_list_meta.h>
+#include <ATen/ops/_nested_tensor_size_meta.h>
+#include <ATen/ops/_nested_tensor_softmax_with_shape_meta.h>
+#include <ATen/ops/_nested_tensor_storage_offsets_meta.h>
+#include <ATen/ops/_nested_tensor_strides_meta.h>
+#include <ATen/ops/_nested_view_from_buffer_meta.h>
+#include <ATen/ops/_nested_view_from_buffer_copy_meta.h>
+#include <ATen/ops/_nested_view_from_jagged_meta.h>
+#include <ATen/ops/_nested_view_from_jagged_copy_meta.h>
+#include <ATen/ops/_new_zeros_with_same_feature_meta_meta.h>
+#include <ATen/ops/_nnpack_available_meta.h>
+#include <ATen/ops/_nnpack_spatial_convolution_meta.h>
+#include <ATen/ops/_nnz_meta.h>
+#include <ATen/ops/_pack_padded_sequence_meta.h>
+#include <ATen/ops/_pack_padded_sequence_backward_meta.h>
+#include <ATen/ops/_pad_circular_meta.h>
+#include <ATen/ops/_pad_enum_meta.h>
+#include <ATen/ops/_pad_packed_sequence_meta.h>
+#include <ATen/ops/_padded_dense_to_jagged_forward_meta.h>
+#include <ATen/ops/_pdist_backward_meta.h>
+#include <ATen/ops/_pdist_forward_meta.h>
+#include <ATen/ops/_pin_memory_meta.h>
+#include <ATen/ops/_prelu_kernel_meta.h>
+#include <ATen/ops/_prelu_kernel_backward_meta.h>
+#include <ATen/ops/_print_meta.h>
+#include <ATen/ops/_propagate_xla_data_meta.h>
+#include <ATen/ops/_remove_batch_dim_meta.h>
+#include <ATen/ops/_reshape_alias_meta.h>
+#include <ATen/ops/_reshape_alias_copy_meta.h>
+#include <ATen/ops/_reshape_copy_meta.h>
+#include <ATen/ops/_reshape_from_tensor_meta.h>
+#include <ATen/ops/_resize_output_meta.h>
+#include <ATen/ops/_rowwise_prune_meta.h>
+#include <ATen/ops/_safe_softmax_meta.h>
+#include <ATen/ops/_sample_dirichlet_meta.h>
+#include <ATen/ops/_saturate_weight_to_fp16_meta.h>
+#include <ATen/ops/_scaled_dot_product_attention_math_meta.h>
+#include <ATen/ops/_scaled_dot_product_attention_math_for_mps_meta.h>
+#include <ATen/ops/_scaled_dot_product_cudnn_attention_meta.h>
+#include <ATen/ops/_scaled_dot_product_cudnn_attention_backward_meta.h>
+#include <ATen/ops/_scaled_dot_product_efficient_attention_meta.h>
+#include <ATen/ops/_scaled_dot_product_efficient_attention_backward_meta.h>
+#include <ATen/ops/_scaled_dot_product_flash_attention_meta.h>
+#include <ATen/ops/_scaled_dot_product_flash_attention_backward_meta.h>
+#include <ATen/ops/_scaled_dot_product_flash_attention_for_cpu_meta.h>
+#include <ATen/ops/_scaled_dot_product_flash_attention_for_cpu_backward_meta.h>
+#include <ATen/ops/_scaled_dot_product_fused_attention_overrideable_meta.h>
+#include <ATen/ops/_scaled_dot_product_fused_attention_overrideable_backward_meta.h>
+#include <ATen/ops/_scaled_grouped_mm_meta.h>
+#include <ATen/ops/_scaled_grouped_mm_v2_meta.h>
+#include <ATen/ops/_scaled_mm_meta.h>
+#include <ATen/ops/_scaled_mm_v2_meta.h>
+#include <ATen/ops/_segment_reduce_backward_meta.h>
+#include <ATen/ops/_shape_as_tensor_meta.h>
+#include <ATen/ops/_slow_conv2d_backward_meta.h>
+#include <ATen/ops/_slow_conv2d_forward_meta.h>
+#include <ATen/ops/_sobol_engine_draw_meta.h>
+#include <ATen/ops/_sobol_engine_ff_meta.h>
+#include <ATen/ops/_sobol_engine_initialize_state_meta.h>
+#include <ATen/ops/_sobol_engine_scramble_meta.h>
+#include <ATen/ops/_softmax_meta.h>
+#include <ATen/ops/_softmax_backward_data_meta.h>
+#include <ATen/ops/_sparse_addmm_meta.h>
+#include <ATen/ops/_sparse_broadcast_to_meta.h>
+#include <ATen/ops/_sparse_broadcast_to_copy_meta.h>
+#include <ATen/ops/_sparse_bsc_tensor_unsafe_meta.h>
+#include <ATen/ops/_sparse_bsr_tensor_unsafe_meta.h>
+#include <ATen/ops/_sparse_compressed_tensor_unsafe_meta.h>
+#include <ATen/ops/_sparse_compressed_tensor_with_dims_meta.h>
+#include <ATen/ops/_sparse_coo_tensor_unsafe_meta.h>
+#include <ATen/ops/_sparse_coo_tensor_with_dims_meta.h>
+#include <ATen/ops/_sparse_coo_tensor_with_dims_and_tensors_meta.h>
+#include <ATen/ops/_sparse_csc_tensor_unsafe_meta.h>
+#include <ATen/ops/_sparse_csr_prod_meta.h>
+#include <ATen/ops/_sparse_csr_sum_meta.h>
+#include <ATen/ops/_sparse_csr_tensor_unsafe_meta.h>
+#include <ATen/ops/_sparse_log_softmax_meta.h>
+#include <ATen/ops/_sparse_log_softmax_backward_data_meta.h>
+#include <ATen/ops/_sparse_mask_projection_meta.h>
+#include <ATen/ops/_sparse_mm_meta.h>
+#include <ATen/ops/_sparse_mm_reduce_impl_meta.h>
+#include <ATen/ops/_sparse_mm_reduce_impl_backward_meta.h>
+#include <ATen/ops/_sparse_semi_structured_addmm_meta.h>
+#include <ATen/ops/_sparse_semi_structured_apply_meta.h>
+#include <ATen/ops/_sparse_semi_structured_apply_dense_meta.h>
+#include <ATen/ops/_sparse_semi_structured_linear_meta.h>
+#include <ATen/ops/_sparse_semi_structured_mm_meta.h>
+#include <ATen/ops/_sparse_semi_structured_tile_meta.h>
+#include <ATen/ops/_sparse_softmax_meta.h>
+#include <ATen/ops/_sparse_softmax_backward_data_meta.h>
+#include <ATen/ops/_sparse_sparse_matmul_meta.h>
+#include <ATen/ops/_sparse_sum_meta.h>
+#include <ATen/ops/_sparse_sum_backward_meta.h>
+#include <ATen/ops/_spdiags_meta.h>
+#include <ATen/ops/_spsolve_meta.h>
+#include <ATen/ops/_stack_meta.h>
+#include <ATen/ops/_standard_gamma_meta.h>
+#include <ATen/ops/_standard_gamma_grad_meta.h>
+#include <ATen/ops/_test_ambiguous_defaults_meta.h>
+#include <ATen/ops/_test_autograd_multiple_dispatch_meta.h>
+#include <ATen/ops/_test_autograd_multiple_dispatch_view_meta.h>
+#include <ATen/ops/_test_autograd_multiple_dispatch_view_copy_meta.h>
+#include <ATen/ops/_test_check_tensor_meta.h>
+#include <ATen/ops/_test_functorch_fallback_meta.h>
+#include <ATen/ops/_test_optional_filled_intlist_meta.h>
+#include <ATen/ops/_test_optional_floatlist_meta.h>
+#include <ATen/ops/_test_optional_intlist_meta.h>
+#include <ATen/ops/_test_parallel_materialize_meta.h>
+#include <ATen/ops/_test_serialization_subcmul_meta.h>
+#include <ATen/ops/_test_string_default_meta.h>
+#include <ATen/ops/_test_warn_in_autograd_meta.h>
+#include <ATen/ops/_thnn_differentiable_gru_cell_backward_meta.h>
+#include <ATen/ops/_thnn_differentiable_lstm_cell_backward_meta.h>
+#include <ATen/ops/_thnn_fused_gru_cell_meta.h>
+#include <ATen/ops/_thnn_fused_gru_cell_backward_meta.h>
+#include <ATen/ops/_thnn_fused_lstm_cell_meta.h>
+#include <ATen/ops/_thnn_fused_lstm_cell_backward_meta.h>
+#include <ATen/ops/_thnn_fused_lstm_cell_backward_impl_meta.h>
+#include <ATen/ops/_to_copy_meta.h>
+#include <ATen/ops/_to_cpu_meta.h>
+#include <ATen/ops/_to_dense_meta.h>
+#include <ATen/ops/_to_sparse_meta.h>
+#include <ATen/ops/_to_sparse_bsc_meta.h>
+#include <ATen/ops/_to_sparse_bsr_meta.h>
+#include <ATen/ops/_to_sparse_csc_meta.h>
+#include <ATen/ops/_to_sparse_csr_meta.h>
+#include <ATen/ops/_to_sparse_semi_structured_meta.h>
+#include <ATen/ops/_transform_bias_rescale_qkv_meta.h>
+#include <ATen/ops/_transformer_encoder_layer_fwd_meta.h>
+#include <ATen/ops/_trilinear_meta.h>
+#include <ATen/ops/_triton_multi_head_attention_meta.h>
+#include <ATen/ops/_triton_scaled_dot_attention_meta.h>
+#include <ATen/ops/_unique_meta.h>
+#include <ATen/ops/_unique2_meta.h>
+#include <ATen/ops/_unpack_dual_meta.h>
+#include <ATen/ops/_unsafe_index_meta.h>
+#include <ATen/ops/_unsafe_index_put_meta.h>
+#include <ATen/ops/_unsafe_masked_index_meta.h>
+#include <ATen/ops/_unsafe_masked_index_put_accumulate_meta.h>
+#include <ATen/ops/_unsafe_view_meta.h>
+#include <ATen/ops/_upsample_bicubic2d_aa_meta.h>
+#include <ATen/ops/_upsample_bicubic2d_aa_backward_meta.h>
+#include <ATen/ops/_upsample_bilinear2d_aa_meta.h>
+#include <ATen/ops/_upsample_bilinear2d_aa_backward_meta.h>
+#include <ATen/ops/_upsample_nearest_exact1d_meta.h>
+#include <ATen/ops/_upsample_nearest_exact1d_backward_meta.h>
+#include <ATen/ops/_upsample_nearest_exact2d_meta.h>
+#include <ATen/ops/_upsample_nearest_exact2d_backward_meta.h>
+#include <ATen/ops/_upsample_nearest_exact3d_meta.h>
+#include <ATen/ops/_upsample_nearest_exact3d_backward_meta.h>
+#include <ATen/ops/_use_cudnn_ctc_loss_meta.h>
+#include <ATen/ops/_use_cudnn_rnn_flatten_weight_meta.h>
+#include <ATen/ops/_validate_compressed_sparse_indices_meta.h>
+#include <ATen/ops/_validate_sparse_bsc_tensor_args_meta.h>
+#include <ATen/ops/_validate_sparse_bsr_tensor_args_meta.h>
+#include <ATen/ops/_validate_sparse_compressed_tensor_args_meta.h>
+#include <ATen/ops/_validate_sparse_coo_tensor_args_meta.h>
+#include <ATen/ops/_validate_sparse_csc_tensor_args_meta.h>
+#include <ATen/ops/_validate_sparse_csr_tensor_args_meta.h>
+#include <ATen/ops/_values_meta.h>
+#include <ATen/ops/_values_copy_meta.h>
+#include <ATen/ops/_version_meta.h>
+#include <ATen/ops/_weight_int4pack_mm_meta.h>
+#include <ATen/ops/_weight_int4pack_mm_for_cpu_meta.h>
+#include <ATen/ops/_weight_int4pack_mm_with_scales_and_zeros_meta.h>
+#include <ATen/ops/_weight_int8pack_mm_meta.h>
+#include <ATen/ops/_weight_norm_meta.h>
+#include <ATen/ops/_weight_norm_differentiable_backward_meta.h>
+#include <ATen/ops/_weight_norm_interface_meta.h>
+#include <ATen/ops/_weight_norm_interface_backward_meta.h>
+#include <ATen/ops/_wrapped_linear_prepack_meta.h>
+#include <ATen/ops/_wrapped_quantized_linear_prepacked_meta.h>
+#include <ATen/ops/abs_meta.h>
+#include <ATen/ops/absolute_meta.h>
+#include <ATen/ops/acos_meta.h>
+#include <ATen/ops/acosh_meta.h>
+#include <ATen/ops/adaptive_avg_pool1d_meta.h>
+#include <ATen/ops/adaptive_avg_pool2d_meta.h>
+#include <ATen/ops/adaptive_avg_pool3d_meta.h>
+#include <ATen/ops/adaptive_avg_pool3d_backward_meta.h>
+#include <ATen/ops/adaptive_max_pool1d_meta.h>
+#include <ATen/ops/adaptive_max_pool2d_meta.h>
+#include <ATen/ops/adaptive_max_pool2d_backward_meta.h>
+#include <ATen/ops/adaptive_max_pool3d_meta.h>
+#include <ATen/ops/adaptive_max_pool3d_backward_meta.h>
+#include <ATen/ops/add_meta.h>
+#include <ATen/ops/addbmm_meta.h>
+#include <ATen/ops/addcdiv_meta.h>
+#include <ATen/ops/addcmul_meta.h>
+#include <ATen/ops/addmm_meta.h>
+#include <ATen/ops/addmv_meta.h>
+#include <ATen/ops/addr_meta.h>
+#include <ATen/ops/adjoint_meta.h>
+#include <ATen/ops/affine_grid_generator_meta.h>
+#include <ATen/ops/affine_grid_generator_backward_meta.h>
+#include <ATen/ops/alias_meta.h>
+#include <ATen/ops/alias_copy_meta.h>
+#include <ATen/ops/align_as_meta.h>
+#include <ATen/ops/align_tensors_meta.h>
+#include <ATen/ops/align_to_meta.h>
+#include <ATen/ops/all_meta.h>
+#include <ATen/ops/allclose_meta.h>
+#include <ATen/ops/alpha_dropout_meta.h>
+#include <ATen/ops/amax_meta.h>
+#include <ATen/ops/amin_meta.h>
+#include <ATen/ops/aminmax_meta.h>
+#include <ATen/ops/and_meta.h>
+#include <ATen/ops/angle_meta.h>
+#include <ATen/ops/any_meta.h>
+#include <ATen/ops/arange_meta.h>
+#include <ATen/ops/arccos_meta.h>
+#include <ATen/ops/arccosh_meta.h>
+#include <ATen/ops/arcsin_meta.h>
+#include <ATen/ops/arcsinh_meta.h>
+#include <ATen/ops/arctan_meta.h>
+#include <ATen/ops/arctan2_meta.h>
+#include <ATen/ops/arctanh_meta.h>
+#include <ATen/ops/argmax_meta.h>
+#include <ATen/ops/argmin_meta.h>
+#include <ATen/ops/argsort_meta.h>
+#include <ATen/ops/argwhere_meta.h>
+#include <ATen/ops/as_strided_meta.h>
+#include <ATen/ops/as_strided_copy_meta.h>
+#include <ATen/ops/as_strided_scatter_meta.h>
+#include <ATen/ops/asin_meta.h>
+#include <ATen/ops/asinh_meta.h>
+#include <ATen/ops/atan_meta.h>
+#include <ATen/ops/atan2_meta.h>
+#include <ATen/ops/atanh_meta.h>
+#include <ATen/ops/atleast_1d_meta.h>
+#include <ATen/ops/atleast_2d_meta.h>
+#include <ATen/ops/atleast_3d_meta.h>
+#include <ATen/ops/avg_pool1d_meta.h>
+#include <ATen/ops/avg_pool2d_meta.h>
+#include <ATen/ops/avg_pool2d_backward_meta.h>
+#include <ATen/ops/avg_pool3d_meta.h>
+#include <ATen/ops/avg_pool3d_backward_meta.h>
+#include <ATen/ops/baddbmm_meta.h>
+#include <ATen/ops/bartlett_window_meta.h>
+#include <ATen/ops/batch_norm_meta.h>
+#include <ATen/ops/batch_norm_backward_meta.h>
+#include <ATen/ops/batch_norm_backward_elemt_meta.h>
+#include <ATen/ops/batch_norm_backward_reduce_meta.h>
+#include <ATen/ops/batch_norm_elemt_meta.h>
+#include <ATen/ops/batch_norm_gather_stats_meta.h>
+#include <ATen/ops/batch_norm_gather_stats_with_counts_meta.h>
+#include <ATen/ops/batch_norm_stats_meta.h>
+#include <ATen/ops/batch_norm_update_stats_meta.h>
+#include <ATen/ops/bernoulli_meta.h>
+#include <ATen/ops/bilinear_meta.h>
+#include <ATen/ops/binary_cross_entropy_meta.h>
+#include <ATen/ops/binary_cross_entropy_backward_meta.h>
+#include <ATen/ops/binary_cross_entropy_with_logits_meta.h>
+#include <ATen/ops/bincount_meta.h>
+#include <ATen/ops/binomial_meta.h>
+#include <ATen/ops/bitwise_and_meta.h>
+#include <ATen/ops/bitwise_left_shift_meta.h>
+#include <ATen/ops/bitwise_not_meta.h>
+#include <ATen/ops/bitwise_or_meta.h>
+#include <ATen/ops/bitwise_right_shift_meta.h>
+#include <ATen/ops/bitwise_xor_meta.h>
+#include <ATen/ops/blackman_window_meta.h>
+#include <ATen/ops/block_diag_meta.h>
+#include <ATen/ops/bmm_meta.h>
+#include <ATen/ops/broadcast_tensors_meta.h>
+#include <ATen/ops/broadcast_to_meta.h>
+#include <ATen/ops/bucketize_meta.h>
+#include <ATen/ops/can_cast_meta.h>
+#include <ATen/ops/cartesian_prod_meta.h>
+#include <ATen/ops/cat_meta.h>
+#include <ATen/ops/cauchy_meta.h>
+#include <ATen/ops/ccol_indices_meta.h>
+#include <ATen/ops/ccol_indices_copy_meta.h>
+#include <ATen/ops/cdist_meta.h>
+#include <ATen/ops/ceil_meta.h>
+#include <ATen/ops/celu_meta.h>
+#include <ATen/ops/chain_matmul_meta.h>
+#include <ATen/ops/chalf_meta.h>
+#include <ATen/ops/channel_shuffle_meta.h>
+#include <ATen/ops/cholesky_meta.h>
+#include <ATen/ops/cholesky_inverse_meta.h>
+#include <ATen/ops/cholesky_solve_meta.h>
+#include <ATen/ops/choose_qparams_optimized_meta.h>
+#include <ATen/ops/chunk_meta.h>
+#include <ATen/ops/clamp_meta.h>
+#include <ATen/ops/clamp_max_meta.h>
+#include <ATen/ops/clamp_min_meta.h>
+#include <ATen/ops/clip_meta.h>
+#include <ATen/ops/clone_meta.h>
+#include <ATen/ops/coalesce_meta.h>
+#include <ATen/ops/col2im_meta.h>
+#include <ATen/ops/col_indices_meta.h>
+#include <ATen/ops/col_indices_copy_meta.h>
+#include <ATen/ops/column_stack_meta.h>
+#include <ATen/ops/combinations_meta.h>
+#include <ATen/ops/complex_meta.h>
+#include <ATen/ops/concat_meta.h>
+#include <ATen/ops/concatenate_meta.h>
+#include <ATen/ops/conj_meta.h>
+#include <ATen/ops/conj_physical_meta.h>
+#include <ATen/ops/constant_pad_nd_meta.h>
+#include <ATen/ops/contiguous_meta.h>
+#include <ATen/ops/conv1d_meta.h>
+#include <ATen/ops/conv2d_meta.h>
+#include <ATen/ops/conv3d_meta.h>
+#include <ATen/ops/conv_depthwise3d_meta.h>
+#include <ATen/ops/conv_tbc_meta.h>
+#include <ATen/ops/conv_tbc_backward_meta.h>
+#include <ATen/ops/conv_transpose1d_meta.h>
+#include <ATen/ops/conv_transpose2d_meta.h>
+#include <ATen/ops/conv_transpose3d_meta.h>
+#include <ATen/ops/convolution_meta.h>
+#include <ATen/ops/convolution_backward_meta.h>
+#include <ATen/ops/convolution_backward_overrideable_meta.h>
+#include <ATen/ops/convolution_overrideable_meta.h>
+#include <ATen/ops/copy_meta.h>
+#include <ATen/ops/copy_sparse_to_sparse_meta.h>
+#include <ATen/ops/copysign_meta.h>
+#include <ATen/ops/corrcoef_meta.h>
+#include <ATen/ops/cos_meta.h>
+#include <ATen/ops/cosh_meta.h>
+#include <ATen/ops/cosine_embedding_loss_meta.h>
+#include <ATen/ops/cosine_similarity_meta.h>
+#include <ATen/ops/count_nonzero_meta.h>
+#include <ATen/ops/cov_meta.h>
+#include <ATen/ops/cross_meta.h>
+#include <ATen/ops/cross_entropy_loss_meta.h>
+#include <ATen/ops/crow_indices_meta.h>
+#include <ATen/ops/crow_indices_copy_meta.h>
+#include <ATen/ops/ctc_loss_meta.h>
+#include <ATen/ops/cudnn_affine_grid_generator_meta.h>
+#include <ATen/ops/cudnn_affine_grid_generator_backward_meta.h>
+#include <ATen/ops/cudnn_batch_norm_meta.h>
+#include <ATen/ops/cudnn_batch_norm_backward_meta.h>
+#include <ATen/ops/cudnn_convolution_meta.h>
+#include <ATen/ops/cudnn_convolution_add_relu_meta.h>
+#include <ATen/ops/cudnn_convolution_relu_meta.h>
+#include <ATen/ops/cudnn_convolution_transpose_meta.h>
+#include <ATen/ops/cudnn_grid_sampler_meta.h>
+#include <ATen/ops/cudnn_grid_sampler_backward_meta.h>
+#include <ATen/ops/cudnn_is_acceptable_meta.h>
+#include <ATen/ops/cummax_meta.h>
+#include <ATen/ops/cummaxmin_backward_meta.h>
+#include <ATen/ops/cummin_meta.h>
+#include <ATen/ops/cumprod_meta.h>
+#include <ATen/ops/cumprod_backward_meta.h>
+#include <ATen/ops/cumsum_meta.h>
+#include <ATen/ops/cumulative_trapezoid_meta.h>
+#include <ATen/ops/data_meta.h>
+#include <ATen/ops/deg2rad_meta.h>
+#include <ATen/ops/dense_dim_meta.h>
+#include <ATen/ops/dequantize_meta.h>
+#include <ATen/ops/det_meta.h>
+#include <ATen/ops/detach_meta.h>
+#include <ATen/ops/detach_copy_meta.h>
+#include <ATen/ops/diag_meta.h>
+#include <ATen/ops/diag_embed_meta.h>
+#include <ATen/ops/diagflat_meta.h>
+#include <ATen/ops/diagonal_meta.h>
+#include <ATen/ops/diagonal_backward_meta.h>
+#include <ATen/ops/diagonal_copy_meta.h>
+#include <ATen/ops/diagonal_scatter_meta.h>
+#include <ATen/ops/diff_meta.h>
+#include <ATen/ops/digamma_meta.h>
+#include <ATen/ops/dist_meta.h>
+#include <ATen/ops/div_meta.h>
+#include <ATen/ops/divide_meta.h>
+#include <ATen/ops/dot_meta.h>
+#include <ATen/ops/dropout_meta.h>
+#include <ATen/ops/dsplit_meta.h>
+#include <ATen/ops/dstack_meta.h>
+#include <ATen/ops/einsum_meta.h>
+#include <ATen/ops/elu_meta.h>
+#include <ATen/ops/elu_backward_meta.h>
+#include <ATen/ops/embedding_meta.h>
+#include <ATen/ops/embedding_backward_meta.h>
+#include <ATen/ops/embedding_bag_meta.h>
+#include <ATen/ops/embedding_dense_backward_meta.h>
+#include <ATen/ops/embedding_renorm_meta.h>
+#include <ATen/ops/embedding_sparse_backward_meta.h>
+#include <ATen/ops/empty_meta.h>
+#include <ATen/ops/empty_like_meta.h>
+#include <ATen/ops/empty_permuted_meta.h>
+#include <ATen/ops/empty_quantized_meta.h>
+#include <ATen/ops/empty_strided_meta.h>
+#include <ATen/ops/eq_meta.h>
+#include <ATen/ops/equal_meta.h>
+#include <ATen/ops/erf_meta.h>
+#include <ATen/ops/erfc_meta.h>
+#include <ATen/ops/erfinv_meta.h>
+#include <ATen/ops/exp_meta.h>
+#include <ATen/ops/exp2_meta.h>
+#include <ATen/ops/expand_meta.h>
+#include <ATen/ops/expand_as_meta.h>
+#include <ATen/ops/expand_copy_meta.h>
+#include <ATen/ops/expm1_meta.h>
+#include <ATen/ops/exponential_meta.h>
+#include <ATen/ops/eye_meta.h>
+#include <ATen/ops/fake_quantize_per_channel_affine_meta.h>
+#include <ATen/ops/fake_quantize_per_channel_affine_cachemask_meta.h>
+#include <ATen/ops/fake_quantize_per_channel_affine_cachemask_backward_meta.h>
+#include <ATen/ops/fake_quantize_per_tensor_affine_meta.h>
+#include <ATen/ops/fake_quantize_per_tensor_affine_cachemask_meta.h>
+#include <ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_meta.h>
+#include <ATen/ops/fbgemm_linear_fp16_weight_meta.h>
+#include <ATen/ops/fbgemm_linear_fp16_weight_fp32_activation_meta.h>
+#include <ATen/ops/fbgemm_linear_int8_weight_meta.h>
+#include <ATen/ops/fbgemm_linear_int8_weight_fp32_activation_meta.h>
+#include <ATen/ops/fbgemm_linear_quantize_weight_meta.h>
+#include <ATen/ops/fbgemm_pack_gemm_matrix_fp16_meta.h>
+#include <ATen/ops/fbgemm_pack_quantized_matrix_meta.h>
+#include <ATen/ops/feature_alpha_dropout_meta.h>
+#include <ATen/ops/feature_dropout_meta.h>
+#include <ATen/ops/fft_fft_meta.h>
+#include <ATen/ops/fft_fft2_meta.h>
+#include <ATen/ops/fft_fftfreq_meta.h>
+#include <ATen/ops/fft_fftn_meta.h>
+#include <ATen/ops/fft_fftshift_meta.h>
+#include <ATen/ops/fft_hfft_meta.h>
+#include <ATen/ops/fft_hfft2_meta.h>
+#include <ATen/ops/fft_hfftn_meta.h>
+#include <ATen/ops/fft_ifft_meta.h>
+#include <ATen/ops/fft_ifft2_meta.h>
+#include <ATen/ops/fft_ifftn_meta.h>
+#include <ATen/ops/fft_ifftshift_meta.h>
+#include <ATen/ops/fft_ihfft_meta.h>
+#include <ATen/ops/fft_ihfft2_meta.h>
+#include <ATen/ops/fft_ihfftn_meta.h>
+#include <ATen/ops/fft_irfft_meta.h>
+#include <ATen/ops/fft_irfft2_meta.h>
+#include <ATen/ops/fft_irfftn_meta.h>
+#include <ATen/ops/fft_rfft_meta.h>
+#include <ATen/ops/fft_rfft2_meta.h>
+#include <ATen/ops/fft_rfftfreq_meta.h>
+#include <ATen/ops/fft_rfftn_meta.h>
+#include <ATen/ops/fill_meta.h>
+#include <ATen/ops/fill_diagonal_meta.h>
+#include <ATen/ops/fix_meta.h>
+#include <ATen/ops/flatten_meta.h>
+#include <ATen/ops/flatten_dense_tensors_meta.h>
+#include <ATen/ops/flip_meta.h>
+#include <ATen/ops/fliplr_meta.h>
+#include <ATen/ops/flipud_meta.h>
+#include <ATen/ops/float_power_meta.h>
+#include <ATen/ops/floor_meta.h>
+#include <ATen/ops/floor_divide_meta.h>
+#include <ATen/ops/fmax_meta.h>
+#include <ATen/ops/fmin_meta.h>
+#include <ATen/ops/fmod_meta.h>
+#include <ATen/ops/frac_meta.h>
+#include <ATen/ops/fractional_max_pool2d_meta.h>
+#include <ATen/ops/fractional_max_pool2d_backward_meta.h>
+#include <ATen/ops/fractional_max_pool3d_meta.h>
+#include <ATen/ops/fractional_max_pool3d_backward_meta.h>
+#include <ATen/ops/frexp_meta.h>
+#include <ATen/ops/frobenius_norm_meta.h>
+#include <ATen/ops/from_file_meta.h>
+#include <ATen/ops/full_meta.h>
+#include <ATen/ops/full_like_meta.h>
+#include <ATen/ops/fused_moving_avg_obs_fake_quant_meta.h>
+#include <ATen/ops/gather_meta.h>
+#include <ATen/ops/gather_backward_meta.h>
+#include <ATen/ops/gcd_meta.h>
+#include <ATen/ops/ge_meta.h>
+#include <ATen/ops/gelu_meta.h>
+#include <ATen/ops/gelu_backward_meta.h>
+#include <ATen/ops/geometric_meta.h>
+#include <ATen/ops/geqrf_meta.h>
+#include <ATen/ops/ger_meta.h>
+#include <ATen/ops/glu_meta.h>
+#include <ATen/ops/glu_backward_meta.h>
+#include <ATen/ops/glu_backward_jvp_meta.h>
+#include <ATen/ops/glu_jvp_meta.h>
+#include <ATen/ops/gradient_meta.h>
+#include <ATen/ops/greater_meta.h>
+#include <ATen/ops/greater_equal_meta.h>
+#include <ATen/ops/grid_sampler_meta.h>
+#include <ATen/ops/grid_sampler_2d_meta.h>
+#include <ATen/ops/grid_sampler_2d_backward_meta.h>
+#include <ATen/ops/grid_sampler_3d_meta.h>
+#include <ATen/ops/grid_sampler_3d_backward_meta.h>
+#include <ATen/ops/group_norm_meta.h>
+#include <ATen/ops/gru_meta.h>
+#include <ATen/ops/gru_cell_meta.h>
+#include <ATen/ops/gt_meta.h>
+#include <ATen/ops/hamming_window_meta.h>
+#include <ATen/ops/hann_window_meta.h>
+#include <ATen/ops/hardshrink_meta.h>
+#include <ATen/ops/hardshrink_backward_meta.h>
+#include <ATen/ops/hardsigmoid_meta.h>
+#include <ATen/ops/hardsigmoid_backward_meta.h>
+#include <ATen/ops/hardswish_meta.h>
+#include <ATen/ops/hardswish_backward_meta.h>
+#include <ATen/ops/hardtanh_meta.h>
+#include <ATen/ops/hardtanh_backward_meta.h>
+#include <ATen/ops/hash_tensor_meta.h>
+#include <ATen/ops/heaviside_meta.h>
+#include <ATen/ops/hinge_embedding_loss_meta.h>
+#include <ATen/ops/histc_meta.h>
+#include <ATen/ops/histogram_meta.h>
+#include <ATen/ops/histogramdd_meta.h>
+#include <ATen/ops/hsplit_meta.h>
+#include <ATen/ops/hspmm_meta.h>
+#include <ATen/ops/hstack_meta.h>
+#include <ATen/ops/huber_loss_meta.h>
+#include <ATen/ops/huber_loss_backward_meta.h>
+#include <ATen/ops/hypot_meta.h>
+#include <ATen/ops/i0_meta.h>
+#include <ATen/ops/igamma_meta.h>
+#include <ATen/ops/igammac_meta.h>
+#include <ATen/ops/im2col_meta.h>
+#include <ATen/ops/imag_meta.h>
+#include <ATen/ops/index_meta.h>
+#include <ATen/ops/index_add_meta.h>
+#include <ATen/ops/index_copy_meta.h>
+#include <ATen/ops/index_fill_meta.h>
+#include <ATen/ops/index_put_meta.h>
+#include <ATen/ops/index_reduce_meta.h>
+#include <ATen/ops/index_select_meta.h>
+#include <ATen/ops/index_select_backward_meta.h>
+#include <ATen/ops/indices_meta.h>
+#include <ATen/ops/indices_copy_meta.h>
+#include <ATen/ops/infinitely_differentiable_gelu_backward_meta.h>
+#include <ATen/ops/inner_meta.h>
+#include <ATen/ops/instance_norm_meta.h>
+#include <ATen/ops/int_repr_meta.h>
+#include <ATen/ops/inverse_meta.h>
+#include <ATen/ops/is_coalesced_meta.h>
+#include <ATen/ops/is_complex_meta.h>
+#include <ATen/ops/is_conj_meta.h>
+#include <ATen/ops/is_distributed_meta.h>
+#include <ATen/ops/is_floating_point_meta.h>
+#include <ATen/ops/is_inference_meta.h>
+#include <ATen/ops/is_leaf_meta.h>
+#include <ATen/ops/is_neg_meta.h>
+#include <ATen/ops/is_nonzero_meta.h>
+#include <ATen/ops/is_pinned_meta.h>
+#include <ATen/ops/is_same_size_meta.h>
+#include <ATen/ops/is_set_to_meta.h>
+#include <ATen/ops/is_signed_meta.h>
+#include <ATen/ops/is_vulkan_available_meta.h>
+#include <ATen/ops/isclose_meta.h>
+#include <ATen/ops/isfinite_meta.h>
+#include <ATen/ops/isin_meta.h>
+#include <ATen/ops/isinf_meta.h>
+#include <ATen/ops/isnan_meta.h>
+#include <ATen/ops/isneginf_meta.h>
+#include <ATen/ops/isposinf_meta.h>
+#include <ATen/ops/isreal_meta.h>
+#include <ATen/ops/istft_meta.h>
+#include <ATen/ops/item_meta.h>
+#include <ATen/ops/kaiser_window_meta.h>
+#include <ATen/ops/kl_div_meta.h>
+#include <ATen/ops/kron_meta.h>
+#include <ATen/ops/kthvalue_meta.h>
+#include <ATen/ops/l1_loss_meta.h>
+#include <ATen/ops/layer_norm_meta.h>
+#include <ATen/ops/lcm_meta.h>
+#include <ATen/ops/ldexp_meta.h>
+#include <ATen/ops/le_meta.h>
+#include <ATen/ops/leaky_relu_meta.h>
+#include <ATen/ops/leaky_relu_backward_meta.h>
+#include <ATen/ops/lerp_meta.h>
+#include <ATen/ops/less_meta.h>
+#include <ATen/ops/less_equal_meta.h>
+#include <ATen/ops/lgamma_meta.h>
+#include <ATen/ops/lift_meta.h>
+#include <ATen/ops/lift_fresh_meta.h>
+#include <ATen/ops/lift_fresh_copy_meta.h>
+#include <ATen/ops/linalg_cholesky_meta.h>
+#include <ATen/ops/linalg_cholesky_ex_meta.h>
+#include <ATen/ops/linalg_cond_meta.h>
+#include <ATen/ops/linalg_cross_meta.h>
+#include <ATen/ops/linalg_det_meta.h>
+#include <ATen/ops/linalg_diagonal_meta.h>
+#include <ATen/ops/linalg_eig_meta.h>
+#include <ATen/ops/linalg_eigh_meta.h>
+#include <ATen/ops/linalg_eigvals_meta.h>
+#include <ATen/ops/linalg_eigvalsh_meta.h>
+#include <ATen/ops/linalg_householder_product_meta.h>
+#include <ATen/ops/linalg_inv_meta.h>
+#include <ATen/ops/linalg_inv_ex_meta.h>
+#include <ATen/ops/linalg_ldl_factor_meta.h>
+#include <ATen/ops/linalg_ldl_factor_ex_meta.h>
+#include <ATen/ops/linalg_ldl_solve_meta.h>
+#include <ATen/ops/linalg_lstsq_meta.h>
+#include <ATen/ops/linalg_lu_meta.h>
+#include <ATen/ops/linalg_lu_factor_meta.h>
+#include <ATen/ops/linalg_lu_factor_ex_meta.h>
+#include <ATen/ops/linalg_lu_solve_meta.h>
+#include <ATen/ops/linalg_matmul_meta.h>
+#include <ATen/ops/linalg_matrix_exp_meta.h>
+#include <ATen/ops/linalg_matrix_norm_meta.h>
+#include <ATen/ops/linalg_matrix_power_meta.h>
+#include <ATen/ops/linalg_matrix_rank_meta.h>
+#include <ATen/ops/linalg_multi_dot_meta.h>
+#include <ATen/ops/linalg_norm_meta.h>
+#include <ATen/ops/linalg_pinv_meta.h>
+#include <ATen/ops/linalg_qr_meta.h>
+#include <ATen/ops/linalg_slogdet_meta.h>
+#include <ATen/ops/linalg_solve_meta.h>
+#include <ATen/ops/linalg_solve_ex_meta.h>
+#include <ATen/ops/linalg_solve_triangular_meta.h>
+#include <ATen/ops/linalg_svd_meta.h>
+#include <ATen/ops/linalg_svdvals_meta.h>
+#include <ATen/ops/linalg_tensorinv_meta.h>
+#include <ATen/ops/linalg_tensorsolve_meta.h>
+#include <ATen/ops/linalg_vander_meta.h>
+#include <ATen/ops/linalg_vecdot_meta.h>
+#include <ATen/ops/linalg_vector_norm_meta.h>
+#include <ATen/ops/linear_meta.h>
+#include <ATen/ops/linear_backward_meta.h>
+#include <ATen/ops/linspace_meta.h>
+#include <ATen/ops/log_meta.h>
+#include <ATen/ops/log10_meta.h>
+#include <ATen/ops/log1p_meta.h>
+#include <ATen/ops/log2_meta.h>
+#include <ATen/ops/log_normal_meta.h>
+#include <ATen/ops/log_sigmoid_meta.h>
+#include <ATen/ops/log_sigmoid_backward_meta.h>
+#include <ATen/ops/log_sigmoid_forward_meta.h>
+#include <ATen/ops/log_softmax_meta.h>
+#include <ATen/ops/logaddexp_meta.h>
+#include <ATen/ops/logaddexp2_meta.h>
+#include <ATen/ops/logcumsumexp_meta.h>
+#include <ATen/ops/logdet_meta.h>
+#include <ATen/ops/logical_and_meta.h>
+#include <ATen/ops/logical_not_meta.h>
+#include <ATen/ops/logical_or_meta.h>
+#include <ATen/ops/logical_xor_meta.h>
+#include <ATen/ops/logit_meta.h>
+#include <ATen/ops/logit_backward_meta.h>
+#include <ATen/ops/logspace_meta.h>
+#include <ATen/ops/logsumexp_meta.h>
+#include <ATen/ops/lshift_meta.h>
+#include <ATen/ops/lstm_meta.h>
+#include <ATen/ops/lstm_cell_meta.h>
+#include <ATen/ops/lstm_mps_backward_meta.h>
+#include <ATen/ops/lt_meta.h>
+#include <ATen/ops/lu_solve_meta.h>
+#include <ATen/ops/lu_unpack_meta.h>
+#include <ATen/ops/mH_meta.h>
+#include <ATen/ops/mT_meta.h>
+#include <ATen/ops/margin_ranking_loss_meta.h>
+#include <ATen/ops/masked_fill_meta.h>
+#include <ATen/ops/masked_scatter_meta.h>
+#include <ATen/ops/masked_scatter_backward_meta.h>
+#include <ATen/ops/masked_select_meta.h>
+#include <ATen/ops/masked_select_backward_meta.h>
+#include <ATen/ops/matmul_meta.h>
+#include <ATen/ops/matmul_backward_meta.h>
+#include <ATen/ops/matrix_H_meta.h>
+#include <ATen/ops/matrix_exp_meta.h>
+#include <ATen/ops/matrix_exp_backward_meta.h>
+#include <ATen/ops/matrix_power_meta.h>
+#include <ATen/ops/max_meta.h>
+#include <ATen/ops/max_pool1d_meta.h>
+#include <ATen/ops/max_pool1d_with_indices_meta.h>
+#include <ATen/ops/max_pool2d_meta.h>
+#include <ATen/ops/max_pool2d_backward_meta.h>
+#include <ATen/ops/max_pool2d_with_indices_meta.h>
+#include <ATen/ops/max_pool2d_with_indices_backward_meta.h>
+#include <ATen/ops/max_pool3d_meta.h>
+#include <ATen/ops/max_pool3d_with_indices_meta.h>
+#include <ATen/ops/max_pool3d_with_indices_backward_meta.h>
+#include <ATen/ops/max_unpool2d_meta.h>
+#include <ATen/ops/max_unpool3d_meta.h>
+#include <ATen/ops/maximum_meta.h>
+#include <ATen/ops/mean_meta.h>
+#include <ATen/ops/median_meta.h>
+#include <ATen/ops/meshgrid_meta.h>
+#include <ATen/ops/min_meta.h>
+#include <ATen/ops/minimum_meta.h>
+#include <ATen/ops/miopen_batch_norm_meta.h>
+#include <ATen/ops/miopen_batch_norm_backward_meta.h>
+#include <ATen/ops/miopen_convolution_meta.h>
+#include <ATen/ops/miopen_convolution_add_relu_meta.h>
+#include <ATen/ops/miopen_convolution_relu_meta.h>
+#include <ATen/ops/miopen_convolution_transpose_meta.h>
+#include <ATen/ops/miopen_depthwise_convolution_meta.h>
+#include <ATen/ops/miopen_rnn_meta.h>
+#include <ATen/ops/miopen_rnn_backward_meta.h>
+#include <ATen/ops/mish_meta.h>
+#include <ATen/ops/mish_backward_meta.h>
+#include <ATen/ops/mkldnn_adaptive_avg_pool2d_meta.h>
+#include <ATen/ops/mkldnn_adaptive_avg_pool2d_backward_meta.h>
+#include <ATen/ops/mkldnn_convolution_meta.h>
+#include <ATen/ops/mkldnn_linear_meta.h>
+#include <ATen/ops/mkldnn_linear_backward_meta.h>
+#include <ATen/ops/mkldnn_linear_backward_input_meta.h>
+#include <ATen/ops/mkldnn_linear_backward_weights_meta.h>
+#include <ATen/ops/mkldnn_max_pool2d_meta.h>
+#include <ATen/ops/mkldnn_max_pool2d_backward_meta.h>
+#include <ATen/ops/mkldnn_max_pool3d_meta.h>
+#include <ATen/ops/mkldnn_max_pool3d_backward_meta.h>
+#include <ATen/ops/mkldnn_reorder_conv2d_weight_meta.h>
+#include <ATen/ops/mkldnn_reorder_conv3d_weight_meta.h>
+#include <ATen/ops/mkldnn_rnn_layer_meta.h>
+#include <ATen/ops/mkldnn_rnn_layer_backward_meta.h>
+#include <ATen/ops/mm_meta.h>
+#include <ATen/ops/mode_meta.h>
+#include <ATen/ops/moveaxis_meta.h>
+#include <ATen/ops/movedim_meta.h>
+#include <ATen/ops/mps_convolution_backward_meta.h>
+#include <ATen/ops/mps_convolution_transpose_backward_meta.h>
+#include <ATen/ops/mse_loss_meta.h>
+#include <ATen/ops/mse_loss_backward_meta.h>
+#include <ATen/ops/msort_meta.h>
+#include <ATen/ops/mul_meta.h>
+#include <ATen/ops/multi_margin_loss_meta.h>
+#include <ATen/ops/multi_margin_loss_backward_meta.h>
+#include <ATen/ops/multilabel_margin_loss_meta.h>
+#include <ATen/ops/multilabel_margin_loss_backward_meta.h>
+#include <ATen/ops/multilabel_margin_loss_forward_meta.h>
+#include <ATen/ops/multinomial_meta.h>
+#include <ATen/ops/multiply_meta.h>
+#include <ATen/ops/mv_meta.h>
+#include <ATen/ops/mvlgamma_meta.h>
+#include <ATen/ops/nan_to_num_meta.h>
+#include <ATen/ops/nanmean_meta.h>
+#include <ATen/ops/nanmedian_meta.h>
+#include <ATen/ops/nanquantile_meta.h>
+#include <ATen/ops/nansum_meta.h>
+#include <ATen/ops/narrow_meta.h>
+#include <ATen/ops/narrow_copy_meta.h>
+#include <ATen/ops/native_batch_norm_meta.h>
+#include <ATen/ops/native_batch_norm_backward_meta.h>
+#include <ATen/ops/native_channel_shuffle_meta.h>
+#include <ATen/ops/native_dropout_meta.h>
+#include <ATen/ops/native_dropout_backward_meta.h>
+#include <ATen/ops/native_group_norm_meta.h>
+#include <ATen/ops/native_group_norm_backward_meta.h>
+#include <ATen/ops/native_layer_norm_meta.h>
+#include <ATen/ops/native_layer_norm_backward_meta.h>
+#include <ATen/ops/native_norm_meta.h>
+#include <ATen/ops/ne_meta.h>
+#include <ATen/ops/neg_meta.h>
+#include <ATen/ops/negative_meta.h>
+#include <ATen/ops/nested_to_padded_tensor_meta.h>
+#include <ATen/ops/new_empty_meta.h>
+#include <ATen/ops/new_empty_strided_meta.h>
+#include <ATen/ops/new_full_meta.h>
+#include <ATen/ops/new_ones_meta.h>
+#include <ATen/ops/new_zeros_meta.h>
+#include <ATen/ops/nextafter_meta.h>
+#include <ATen/ops/nll_loss_meta.h>
+#include <ATen/ops/nll_loss2d_meta.h>
+#include <ATen/ops/nll_loss2d_backward_meta.h>
+#include <ATen/ops/nll_loss2d_forward_meta.h>
+#include <ATen/ops/nll_loss_backward_meta.h>
+#include <ATen/ops/nll_loss_forward_meta.h>
+#include <ATen/ops/nll_loss_nd_meta.h>
+#include <ATen/ops/nonzero_meta.h>
+#include <ATen/ops/nonzero_numpy_meta.h>
+#include <ATen/ops/nonzero_static_meta.h>
+#include <ATen/ops/norm_meta.h>
+#include <ATen/ops/norm_except_dim_meta.h>
+#include <ATen/ops/normal_meta.h>
+#include <ATen/ops/not_equal_meta.h>
+#include <ATen/ops/nuclear_norm_meta.h>
+#include <ATen/ops/numpy_T_meta.h>
+#include <ATen/ops/one_hot_meta.h>
+#include <ATen/ops/ones_meta.h>
+#include <ATen/ops/ones_like_meta.h>
+#include <ATen/ops/or_meta.h>
+#include <ATen/ops/orgqr_meta.h>
+#include <ATen/ops/ormqr_meta.h>
+#include <ATen/ops/outer_meta.h>
+#include <ATen/ops/output_nr_meta.h>
+#include <ATen/ops/pad_meta.h>
+#include <ATen/ops/pad_sequence_meta.h>
+#include <ATen/ops/pairwise_distance_meta.h>
+#include <ATen/ops/pdist_meta.h>
+#include <ATen/ops/permute_meta.h>
+#include <ATen/ops/permute_copy_meta.h>
+#include <ATen/ops/pin_memory_meta.h>
+#include <ATen/ops/pinverse_meta.h>
+#include <ATen/ops/pixel_shuffle_meta.h>
+#include <ATen/ops/pixel_unshuffle_meta.h>
+#include <ATen/ops/poisson_meta.h>
+#include <ATen/ops/poisson_nll_loss_meta.h>
+#include <ATen/ops/polar_meta.h>
+#include <ATen/ops/polygamma_meta.h>
+#include <ATen/ops/positive_meta.h>
+#include <ATen/ops/pow_meta.h>
+#include <ATen/ops/prelu_meta.h>
+#include <ATen/ops/prod_meta.h>
+#include <ATen/ops/promote_types_meta.h>
+#include <ATen/ops/put_meta.h>
+#include <ATen/ops/q_per_channel_axis_meta.h>
+#include <ATen/ops/q_per_channel_scales_meta.h>
+#include <ATen/ops/q_per_channel_zero_points_meta.h>
+#include <ATen/ops/q_scale_meta.h>
+#include <ATen/ops/q_zero_point_meta.h>
+#include <ATen/ops/qr_meta.h>
+#include <ATen/ops/qscheme_meta.h>
+#include <ATen/ops/quantile_meta.h>
+#include <ATen/ops/quantize_per_channel_meta.h>
+#include <ATen/ops/quantize_per_tensor_meta.h>
+#include <ATen/ops/quantize_per_tensor_dynamic_meta.h>
+#include <ATen/ops/quantized_batch_norm_meta.h>
+#include <ATen/ops/quantized_gru_cell_meta.h>
+#include <ATen/ops/quantized_lstm_cell_meta.h>
+#include <ATen/ops/quantized_max_pool1d_meta.h>
+#include <ATen/ops/quantized_max_pool2d_meta.h>
+#include <ATen/ops/quantized_max_pool3d_meta.h>
+#include <ATen/ops/quantized_rnn_relu_cell_meta.h>
+#include <ATen/ops/quantized_rnn_tanh_cell_meta.h>
+#include <ATen/ops/rad2deg_meta.h>
+#include <ATen/ops/rand_meta.h>
+#include <ATen/ops/rand_like_meta.h>
+#include <ATen/ops/randint_meta.h>
+#include <ATen/ops/randint_like_meta.h>
+#include <ATen/ops/randn_meta.h>
+#include <ATen/ops/randn_like_meta.h>
+#include <ATen/ops/random_meta.h>
+#include <ATen/ops/randperm_meta.h>
+#include <ATen/ops/range_meta.h>
+#include <ATen/ops/ravel_meta.h>
+#include <ATen/ops/real_meta.h>
+#include <ATen/ops/reciprocal_meta.h>
+#include <ATen/ops/record_stream_meta.h>
+#include <ATen/ops/refine_names_meta.h>
+#include <ATen/ops/reflection_pad1d_meta.h>
+#include <ATen/ops/reflection_pad1d_backward_meta.h>
+#include <ATen/ops/reflection_pad2d_meta.h>
+#include <ATen/ops/reflection_pad2d_backward_meta.h>
+#include <ATen/ops/reflection_pad3d_meta.h>
+#include <ATen/ops/reflection_pad3d_backward_meta.h>
+#include <ATen/ops/relu_meta.h>
+#include <ATen/ops/relu6_meta.h>
+#include <ATen/ops/remainder_meta.h>
+#include <ATen/ops/rename_meta.h>
+#include <ATen/ops/renorm_meta.h>
+#include <ATen/ops/repeat_meta.h>
+#include <ATen/ops/repeat_interleave_meta.h>
+#include <ATen/ops/replication_pad1d_meta.h>
+#include <ATen/ops/replication_pad1d_backward_meta.h>
+#include <ATen/ops/replication_pad2d_meta.h>
+#include <ATen/ops/replication_pad2d_backward_meta.h>
+#include <ATen/ops/replication_pad3d_meta.h>
+#include <ATen/ops/replication_pad3d_backward_meta.h>
+#include <ATen/ops/requires_grad_meta.h>
+#include <ATen/ops/reshape_meta.h>
+#include <ATen/ops/reshape_as_meta.h>
+#include <ATen/ops/resize_meta.h>
+#include <ATen/ops/resize_as_meta.h>
+#include <ATen/ops/resize_as_sparse_meta.h>
+#include <ATen/ops/resolve_conj_meta.h>
+#include <ATen/ops/resolve_neg_meta.h>
+#include <ATen/ops/result_type_meta.h>
+#include <ATen/ops/retain_grad_meta.h>
+#include <ATen/ops/retains_grad_meta.h>
+#include <ATen/ops/rms_norm_meta.h>
+#include <ATen/ops/rnn_relu_meta.h>
+#include <ATen/ops/rnn_relu_cell_meta.h>
+#include <ATen/ops/rnn_tanh_meta.h>
+#include <ATen/ops/rnn_tanh_cell_meta.h>
+#include <ATen/ops/roll_meta.h>
+#include <ATen/ops/rot90_meta.h>
+#include <ATen/ops/round_meta.h>
+#include <ATen/ops/row_indices_meta.h>
+#include <ATen/ops/row_indices_copy_meta.h>
+#include <ATen/ops/row_stack_meta.h>
+#include <ATen/ops/rrelu_meta.h>
+#include <ATen/ops/rrelu_with_noise_meta.h>
+#include <ATen/ops/rrelu_with_noise_backward_meta.h>
+#include <ATen/ops/rshift_meta.h>
+#include <ATen/ops/rsqrt_meta.h>
+#include <ATen/ops/rsub_meta.h>
+#include <ATen/ops/scalar_tensor_meta.h>
+#include <ATen/ops/scaled_dot_product_attention_meta.h>
+#include <ATen/ops/scatter_meta.h>
+#include <ATen/ops/scatter_add_meta.h>
+#include <ATen/ops/scatter_reduce_meta.h>
+#include <ATen/ops/searchsorted_meta.h>
+#include <ATen/ops/segment_reduce_meta.h>
+#include <ATen/ops/select_meta.h>
+#include <ATen/ops/select_backward_meta.h>
+#include <ATen/ops/select_copy_meta.h>
+#include <ATen/ops/select_scatter_meta.h>
+#include <ATen/ops/selu_meta.h>
+#include <ATen/ops/set_meta.h>
+#include <ATen/ops/set_data_meta.h>
+#include <ATen/ops/sgn_meta.h>
+#include <ATen/ops/sigmoid_meta.h>
+#include <ATen/ops/sigmoid_backward_meta.h>
+#include <ATen/ops/sign_meta.h>
+#include <ATen/ops/signbit_meta.h>
+#include <ATen/ops/silu_meta.h>
+#include <ATen/ops/silu_backward_meta.h>
+#include <ATen/ops/sin_meta.h>
+#include <ATen/ops/sinc_meta.h>
+#include <ATen/ops/sinh_meta.h>
+#include <ATen/ops/size_meta.h>
+#include <ATen/ops/slice_meta.h>
+#include <ATen/ops/slice_backward_meta.h>
+#include <ATen/ops/slice_copy_meta.h>
+#include <ATen/ops/slice_inverse_meta.h>
+#include <ATen/ops/slice_scatter_meta.h>
+#include <ATen/ops/slogdet_meta.h>
+#include <ATen/ops/slow_conv3d_meta.h>
+#include <ATen/ops/slow_conv3d_forward_meta.h>
+#include <ATen/ops/slow_conv_dilated2d_meta.h>
+#include <ATen/ops/slow_conv_dilated3d_meta.h>
+#include <ATen/ops/slow_conv_transpose2d_meta.h>
+#include <ATen/ops/slow_conv_transpose3d_meta.h>
+#include <ATen/ops/smm_meta.h>
+#include <ATen/ops/smooth_l1_loss_meta.h>
+#include <ATen/ops/smooth_l1_loss_backward_meta.h>
+#include <ATen/ops/soft_margin_loss_meta.h>
+#include <ATen/ops/soft_margin_loss_backward_meta.h>
+#include <ATen/ops/softmax_meta.h>
+#include <ATen/ops/softplus_meta.h>
+#include <ATen/ops/softplus_backward_meta.h>
+#include <ATen/ops/softshrink_meta.h>
+#include <ATen/ops/softshrink_backward_meta.h>
+#include <ATen/ops/sort_meta.h>
+#include <ATen/ops/sparse_bsc_tensor_meta.h>
+#include <ATen/ops/sparse_bsr_tensor_meta.h>
+#include <ATen/ops/sparse_compressed_tensor_meta.h>
+#include <ATen/ops/sparse_coo_tensor_meta.h>
+#include <ATen/ops/sparse_csc_tensor_meta.h>
+#include <ATen/ops/sparse_csr_tensor_meta.h>
+#include <ATen/ops/sparse_dim_meta.h>
+#include <ATen/ops/sparse_mask_meta.h>
+#include <ATen/ops/sparse_resize_meta.h>
+#include <ATen/ops/sparse_resize_and_clear_meta.h>
+#include <ATen/ops/sparse_sampled_addmm_meta.h>
+#include <ATen/ops/special_airy_ai_meta.h>
+#include <ATen/ops/special_bessel_j0_meta.h>
+#include <ATen/ops/special_bessel_j1_meta.h>
+#include <ATen/ops/special_bessel_y0_meta.h>
+#include <ATen/ops/special_bessel_y1_meta.h>
+#include <ATen/ops/special_chebyshev_polynomial_t_meta.h>
+#include <ATen/ops/special_chebyshev_polynomial_u_meta.h>
+#include <ATen/ops/special_chebyshev_polynomial_v_meta.h>
+#include <ATen/ops/special_chebyshev_polynomial_w_meta.h>
+#include <ATen/ops/special_digamma_meta.h>
+#include <ATen/ops/special_entr_meta.h>
+#include <ATen/ops/special_erf_meta.h>
+#include <ATen/ops/special_erfc_meta.h>
+#include <ATen/ops/special_erfcx_meta.h>
+#include <ATen/ops/special_erfinv_meta.h>
+#include <ATen/ops/special_exp2_meta.h>
+#include <ATen/ops/special_expit_meta.h>
+#include <ATen/ops/special_expm1_meta.h>
+#include <ATen/ops/special_gammainc_meta.h>
+#include <ATen/ops/special_gammaincc_meta.h>
+#include <ATen/ops/special_gammaln_meta.h>
+#include <ATen/ops/special_hermite_polynomial_h_meta.h>
+#include <ATen/ops/special_hermite_polynomial_he_meta.h>
+#include <ATen/ops/special_i0_meta.h>
+#include <ATen/ops/special_i0e_meta.h>
+#include <ATen/ops/special_i1_meta.h>
+#include <ATen/ops/special_i1e_meta.h>
+#include <ATen/ops/special_laguerre_polynomial_l_meta.h>
+#include <ATen/ops/special_legendre_polynomial_p_meta.h>
+#include <ATen/ops/special_log1p_meta.h>
+#include <ATen/ops/special_log_ndtr_meta.h>
+#include <ATen/ops/special_log_softmax_meta.h>
+#include <ATen/ops/special_logit_meta.h>
+#include <ATen/ops/special_logsumexp_meta.h>
+#include <ATen/ops/special_modified_bessel_i0_meta.h>
+#include <ATen/ops/special_modified_bessel_i1_meta.h>
+#include <ATen/ops/special_modified_bessel_k0_meta.h>
+#include <ATen/ops/special_modified_bessel_k1_meta.h>
+#include <ATen/ops/special_multigammaln_meta.h>
+#include <ATen/ops/special_ndtr_meta.h>
+#include <ATen/ops/special_ndtri_meta.h>
+#include <ATen/ops/special_polygamma_meta.h>
+#include <ATen/ops/special_psi_meta.h>
+#include <ATen/ops/special_round_meta.h>
+#include <ATen/ops/special_scaled_modified_bessel_k0_meta.h>
+#include <ATen/ops/special_scaled_modified_bessel_k1_meta.h>
+#include <ATen/ops/special_shifted_chebyshev_polynomial_t_meta.h>
+#include <ATen/ops/special_shifted_chebyshev_polynomial_u_meta.h>
+#include <ATen/ops/special_shifted_chebyshev_polynomial_v_meta.h>
+#include <ATen/ops/special_shifted_chebyshev_polynomial_w_meta.h>
+#include <ATen/ops/special_sinc_meta.h>
+#include <ATen/ops/special_softmax_meta.h>
+#include <ATen/ops/special_spherical_bessel_j0_meta.h>
+#include <ATen/ops/special_xlog1py_meta.h>
+#include <ATen/ops/special_xlogy_meta.h>
+#include <ATen/ops/special_zeta_meta.h>
+#include <ATen/ops/split_meta.h>
+#include <ATen/ops/split_copy_meta.h>
+#include <ATen/ops/split_with_sizes_meta.h>
+#include <ATen/ops/split_with_sizes_copy_meta.h>
+#include <ATen/ops/sqrt_meta.h>
+#include <ATen/ops/square_meta.h>
+#include <ATen/ops/squeeze_meta.h>
+#include <ATen/ops/squeeze_copy_meta.h>
+#include <ATen/ops/sspaddmm_meta.h>
+#include <ATen/ops/stack_meta.h>
+#include <ATen/ops/std_meta.h>
+#include <ATen/ops/std_mean_meta.h>
+#include <ATen/ops/stft_meta.h>
+#include <ATen/ops/stride_meta.h>
+#include <ATen/ops/sub_meta.h>
+#include <ATen/ops/subtract_meta.h>
+#include <ATen/ops/sum_meta.h>
+#include <ATen/ops/sum_to_size_meta.h>
+#include <ATen/ops/svd_meta.h>
+#include <ATen/ops/swapaxes_meta.h>
+#include <ATen/ops/swapdims_meta.h>
+#include <ATen/ops/sym_constrain_range_meta.h>
+#include <ATen/ops/sym_constrain_range_for_size_meta.h>
+#include <ATen/ops/sym_is_contiguous_meta.h>
+#include <ATen/ops/sym_numel_meta.h>
+#include <ATen/ops/sym_size_meta.h>
+#include <ATen/ops/sym_storage_offset_meta.h>
+#include <ATen/ops/sym_stride_meta.h>
+#include <ATen/ops/t_meta.h>
+#include <ATen/ops/t_copy_meta.h>
+#include <ATen/ops/take_meta.h>
+#include <ATen/ops/take_along_dim_meta.h>
+#include <ATen/ops/tan_meta.h>
+#include <ATen/ops/tanh_meta.h>
+#include <ATen/ops/tanh_backward_meta.h>
+#include <ATen/ops/tensor_split_meta.h>
+#include <ATen/ops/tensordot_meta.h>
+#include <ATen/ops/thnn_conv2d_meta.h>
+#include <ATen/ops/threshold_meta.h>
+#include <ATen/ops/threshold_backward_meta.h>
+#include <ATen/ops/tile_meta.h>
+#include <ATen/ops/to_meta.h>
+#include <ATen/ops/to_dense_meta.h>
+#include <ATen/ops/to_dense_backward_meta.h>
+#include <ATen/ops/to_mkldnn_meta.h>
+#include <ATen/ops/to_mkldnn_backward_meta.h>
+#include <ATen/ops/to_padded_tensor_meta.h>
+#include <ATen/ops/to_sparse_meta.h>
+#include <ATen/ops/to_sparse_bsc_meta.h>
+#include <ATen/ops/to_sparse_bsr_meta.h>
+#include <ATen/ops/to_sparse_csc_meta.h>
+#include <ATen/ops/to_sparse_csr_meta.h>
+#include <ATen/ops/topk_meta.h>
+#include <ATen/ops/trace_meta.h>
+#include <ATen/ops/trace_backward_meta.h>
+#include <ATen/ops/transpose_meta.h>
+#include <ATen/ops/transpose_copy_meta.h>
+#include <ATen/ops/trapezoid_meta.h>
+#include <ATen/ops/trapz_meta.h>
+#include <ATen/ops/triangular_solve_meta.h>
+#include <ATen/ops/tril_meta.h>
+#include <ATen/ops/tril_indices_meta.h>
+#include <ATen/ops/triplet_margin_loss_meta.h>
+#include <ATen/ops/triu_meta.h>
+#include <ATen/ops/triu_indices_meta.h>
+#include <ATen/ops/true_divide_meta.h>
+#include <ATen/ops/trunc_meta.h>
+#include <ATen/ops/type_as_meta.h>
+#include <ATen/ops/unbind_meta.h>
+#include <ATen/ops/unbind_copy_meta.h>
+#include <ATen/ops/unflatten_meta.h>
+#include <ATen/ops/unflatten_dense_tensors_meta.h>
+#include <ATen/ops/unfold_meta.h>
+#include <ATen/ops/unfold_backward_meta.h>
+#include <ATen/ops/unfold_copy_meta.h>
+#include <ATen/ops/uniform_meta.h>
+#include <ATen/ops/unique_consecutive_meta.h>
+#include <ATen/ops/unique_dim_meta.h>
+#include <ATen/ops/unique_dim_consecutive_meta.h>
+#include <ATen/ops/unsafe_chunk_meta.h>
+#include <ATen/ops/unsafe_split_meta.h>
+#include <ATen/ops/unsafe_split_with_sizes_meta.h>
+#include <ATen/ops/unsqueeze_meta.h>
+#include <ATen/ops/unsqueeze_copy_meta.h>
+#include <ATen/ops/upsample_bicubic2d_meta.h>
+#include <ATen/ops/upsample_bicubic2d_backward_meta.h>
+#include <ATen/ops/upsample_bilinear2d_meta.h>
+#include <ATen/ops/upsample_bilinear2d_backward_meta.h>
+#include <ATen/ops/upsample_linear1d_meta.h>
+#include <ATen/ops/upsample_linear1d_backward_meta.h>
+#include <ATen/ops/upsample_nearest1d_meta.h>
+#include <ATen/ops/upsample_nearest1d_backward_meta.h>
+#include <ATen/ops/upsample_nearest2d_meta.h>
+#include <ATen/ops/upsample_nearest2d_backward_meta.h>
+#include <ATen/ops/upsample_nearest3d_meta.h>
+#include <ATen/ops/upsample_nearest3d_backward_meta.h>
+#include <ATen/ops/upsample_trilinear3d_meta.h>
+#include <ATen/ops/upsample_trilinear3d_backward_meta.h>
+#include <ATen/ops/value_selecting_reduction_backward_meta.h>
+#include <ATen/ops/values_meta.h>
+#include <ATen/ops/values_copy_meta.h>
+#include <ATen/ops/vander_meta.h>
+#include <ATen/ops/var_meta.h>
+#include <ATen/ops/var_mean_meta.h>
+#include <ATen/ops/vdot_meta.h>
+#include <ATen/ops/view_meta.h>
+#include <ATen/ops/view_as_meta.h>
+#include <ATen/ops/view_as_complex_meta.h>
+#include <ATen/ops/view_as_complex_copy_meta.h>
+#include <ATen/ops/view_as_real_meta.h>
+#include <ATen/ops/view_as_real_copy_meta.h>
+#include <ATen/ops/view_copy_meta.h>
+#include <ATen/ops/vsplit_meta.h>
+#include <ATen/ops/vstack_meta.h>
+#include <ATen/ops/where_meta.h>
+#include <ATen/ops/xlogy_meta.h>
+#include <ATen/ops/xor_meta.h>
+#include <ATen/ops/zero_meta.h>
+#include <ATen/ops/zeros_meta.h>
+#include <ATen/ops/zeros_like_meta.h>
+namespace at {  // NOTE(review): at::meta is presumably populated by the per-operator *_meta.h headers included above - confirm.
+namespace meta {  // Empty in this header: nothing is declared directly inside at::meta here.
+} // namespace meta
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/NestedTensorImpl.h ADDED Viewed

	@@ -0,0 +1,292 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/MemoryOverlap.h>
+#include <ATen/Tensor.h>
+#include <c10/core/DispatchKey.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/TensorImpl.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/irange.h>
+namespace at::native {
+struct NestedTensorImpl;  // Forward declaration; lets the helper below take a NestedTensorImpl pointer before the struct is defined.
+inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt);  // Declared early because NestedTensorImpl::get_buffer() calls it from its inline body.
+int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor);  // Declaration only. NOTE(review): presumably derives an element count from a nested-size tensor - confirm against the definition.
+at::Tensor construct_nested_strides(const at::Tensor& nested_size);  // Declaration only. NOTE(review): presumably builds the strides for a contiguous layout of `nested_size` - confirm.
+at::Tensor construct_offsets(const at::Tensor& nested_size);  // Declaration only. NOTE(review): presumably builds the storage offsets for a contiguous layout of `nested_size` - confirm.
+struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
+  explicit NestedTensorImpl(
+      Storage storage,
+      c10::DispatchKeySet key_set,
+      const caffe2::TypeMeta data_type,
+      at::Tensor nested_sizes,
+      at::Tensor nested_strides,
+      at::Tensor storage_offsets);
+  explicit NestedTensorImpl(
+      const at::Tensor& buffer,
+      at::Tensor nested_sizes,
+      at::Tensor nested_strides,
+      at::Tensor storage_offsets);
+  // assume contiguous, `nested_strides` and `offsets`
+  // can be inferred from `nested_sizes`
+  explicit NestedTensorImpl(
+      const at::Tensor& buffer,
+      const at::Tensor& nested_sizes);
+  // This constructor is used for creating view tensors from nested tensors
+  explicit NestedTensorImpl(
+      c10::TensorImpl::ImplType impl_type,
+      const at::Tensor& base_tensor,
+      at::Tensor nested_sizes,
+      at::Tensor nested_strides,
+      at::Tensor storage_offsets);
+  // TODO: don't expose private implementation details like this; in
+  // particular, resizing this tensor will mess up our dim() and
+  // callers cannot fix it.
+  const Tensor& get_nested_sizes() const {  // Read-only accessor: hands back the nested_sizes_ member by const reference (no copy).
+    return nested_sizes_;
+  }
+  // TODO: don't expose private implementation details like this
+  const Tensor& get_nested_strides() const {  // Read-only accessor: hands back the nested_strides_ member by const reference (no copy).
+    return nested_strides_;
+  }
+  const Tensor& get_storage_offsets() const {  // Read-only accessor: hands back the storage_offsets_ member by const reference (no copy).
+    return storage_offsets_;
+  }
+  // Returns nullopt if the ith dimension is irregular. The ith dimension
+  // of a NestedTensor is regular if the unbound tensors match in
+  // size at the (i-1)th dimension.
+  std::optional<int64_t> opt_size(int64_t d) const;
+  int64_t size(int64_t d) const {  // Size of dimension `d`; fails the TORCH_CHECK below when opt_size(d) holds no value.
+    std::optional<int64_t> optional_size = this->opt_size(d);
+    TORCH_CHECK(
+        optional_size.has_value(),
+        "Given dimension ",
+        d,
+        " is irregular and does not have a size.");
+    return *optional_size;  // Dereference is guarded by the has_value() check above.
+  }
+  /**
+   * Return a view of the nested tensor as a 1 dimensional contiguous tensor.
+   *
+   * The buffer tensor created by this function shares the same storage_impl as
+   * the original nested tensor, and therefore can be seen as a view.
+   *
+   * @return A newly constructed view tensor
+   */
+  at::Tensor get_buffer() const {  // Contiguity-checked wrapper around get_unsafe_storage_as_tensor().
+    TORCH_CHECK(
+        nested_tensor_impl_is_contiguous(this),
+        "NestedTensor must be contiguous to get buffer.");
+    return get_unsafe_storage_as_tensor();  // Same tensor the unsafe variant builds; only the check above differs.
+  }
+  /**
+   * Wraps this tensor's storage in a flat, one-dimensional tensor without
+   * performing any contiguity check. Prefer get_buffer() whenever possible.
+   *
+   * The returned tensor ignores the nested layout, so the caller must account
+   * for nested_sizes, nested_strides and storage_offsets when using it.
+   *
+   * @return A newly constructed view tensor
+   */
+  at::Tensor get_unsafe_storage_as_tensor() const {
+    // Element count of the storage, converted to the signed size type.
+    const auto flat_numel = static_cast<int64_t>(get_buffer_size());
+    // The new impl is built on a copy of this tensor's Storage handle.
+    auto flat_impl = c10::make_intrusive<TensorImpl>(
+        c10::TensorImpl::VIEW,
+        Storage(storage_),
+        generate_buffer_key_set(),
+        data_type_);
+    // A single-element size list makes the result one-dimensional.
+    flat_impl->set_sizes_contiguous(c10::makeArrayRef(flat_numel));
+    return Tensor(flat_impl);
+  }
+  // Number of elements that fit in the storage: its byte size divided by the
+  // item size of data_type_ (integer division).
+  size_t get_buffer_size() const {
+    const size_t total_bytes = storage_.nbytes();
+    const size_t bytes_per_item = data_type_.itemsize();
+    return total_bytes / bytes_per_item;
+  }
+ protected:
+  const char* tensorimpl_type_name() const override;
+  // TODO: numel_custom and is_contiguous_custom can be profitably overridden
+  // with real implementations (they are only declared here, defined out of line)
+  int64_t numel_custom() const override;
+  c10::SymInt sym_numel_custom() const override;
+  c10::SymBool sym_is_contiguous_custom(
+      MemoryFormat /*memory_format*/) const override;
+  int64_t size_custom(int64_t d) const override { // forwards to size(d), which fails its check for irregular dims
+    return this->size(d);
+  }
+  c10::SymInt sym_size_custom(int64_t d) const override { // same value as size(d), wrapped in a SymInt
+    return c10::SymInt{this->size(d)};
+  }
+  IntArrayRef sizes_custom() const override;
+  c10::SymIntArrayRef sym_sizes_custom() const override;
+  IntArrayRef strides_custom() const override;
+  c10::SymIntArrayRef sym_strides_custom() const override;
+  // this one is real (original author's note; the definition is not in this header)
+  int64_t dim_custom() const override;
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      const c10::VariableVersion& version_counter,
+      bool allow_tensor_metadata_change) const override;
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
+      c10::VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const override;
+  // Copies tensor metadata from `impl` into this object via
+  // copy_tensor_metadata, passing along this object's own version counter and
+  // its own allow_tensor_metadata_change() setting.
+  void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) override {
+    auto* source_impl = impl.get();
+    copy_tensor_metadata(
+        source_impl,
+        this,
+        version_counter(),
+        allow_tensor_metadata_change());
+  }
+ private:
+  // Must be called after any changes to our dim() to sync the state
+  // to TensorImpl. (Declared here; defined out of line.)
+  void refresh_dim();
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const at::Tensor nested_sizes_, nested_strides_;
+  // The starting positions of the underlying tensors in contiguous buffer
+  // i.e. the buffer memory offsets to get the underlying tensors
+  // The reason to keep this metadata is that, without a strong enough
+  // constraint, it cannot be derived from `nested_sizes_`
+  // and `nested_strides_`:
+  // 1. when buffer has blanks, e.g. [tensor1, blank, tensor2]
+  //    this can happen e.g. after slicing a nested tensor
+  // 2. when multiple tensors share the same memory
+  // 3. when the nesting ordering is changed, e.g. [tensor1, tensor3, tensor2]
+  // Some strong enough constraints are:
+  // 1. every underlying tensor is contiguous in memory
+  //    && nesting in ascending order
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const at::Tensor storage_offsets_;
+  // NOTE: -1 here means the size is missing
+  // Optional to allow it to be computed lazily (presumably from nested_sizes_).
+  // TODO: maybe we can remove this metadata since
+  //       we can compute it from `nested_sizes_`
+  mutable std::optional<std::vector<int64_t>> opt_sizes_;
+  template <typename VariableVersion>
+  c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach_core(
+      VariableVersion&& version_counter,
+      bool allow_tensor_metadata_change) const;
+  /**
+   * Derives a non-nested dispatch key set from this nested tensor's key set.
+   *
+   * Many nested tensor kernels build a buffer tensor and redispatch to a
+   * non-nested kernel; this produces the key set for that buffer tensor.
+   * The NestedTensor and AutogradNestedTensor keys are removed, Dense is
+   * added, and Autograd is added when the original set had any key from
+   * c10::autograd_dispatch_keyset.
+   *
+   * @return Appropriate key set for non-nested tensor
+   */
+  inline c10::DispatchKeySet generate_buffer_key_set() const {
+    const auto nested_key_set = this->key_set();
+    // Decided on the original key set, before the nested keys are stripped.
+    const bool had_autograd =
+        nested_key_set.has_any(c10::autograd_dispatch_keyset);
+    const auto without_nested_keys = nested_key_set -
+        c10::DispatchKeySet{
+            c10::DispatchKey::NestedTensor,
+            c10::DispatchKey::AutogradNestedTensor};
+    auto dense_key_set =
+        without_nested_keys | c10::DispatchKeySet{c10::DispatchKey::Dense};
+    if (had_autograd) {
+      dense_key_set =
+          c10::DispatchKeySet{c10::DispatchKey::Autograd} | dense_key_set;
+    }
+    return dense_key_set;
+  }
+};
+// Returns the tensor's impl cast to NestedTensorImpl* when tensor.is_nested()
+// is true, and nullptr otherwise.
+inline NestedTensorImpl* get_nested_tensor_impl_or_null(
+    const at::Tensor& tensor) {
+  return tensor.is_nested()
+      ? static_cast<NestedTensorImpl*>(tensor.unsafeGetTensorImpl())
+      : nullptr;
+}
+// Checked variant of the cast above: fails a TORCH_CHECK unless
+// tensor.is_nested(), then returns the impl as NestedTensorImpl*.
+inline NestedTensorImpl* get_nested_tensor_impl(const at::Tensor& tensor) {
+  TORCH_CHECK(
+      tensor.is_nested(), "get_nested_tensor_impl requires a NestedTensor.");
+  auto* raw_impl = tensor.unsafeGetTensorImpl();
+  return static_cast<NestedTensorImpl*>(raw_impl);
+}
+// Decides whether a nested tensor is laid out contiguously in its buffer.
+//
+// With ntensors = nt->size(0) and orig_dim = number of columns of the
+// nested-sizes matrix, the answer is true when:
+//   * ntensors == 0, or
+//   * orig_dim == 0 (nested scalars) and the i-th storage offset equals i, or
+//   * every row of sizes/strides describes a densely packed tensor (last
+//     stride 1, each earlier stride equal to the product of the later sizes),
+//     the first offset is 0, and each later offset equals the previous offset
+//     plus size[0] * stride[0] of the previous row.
+inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) {
+  const int64_t ntensors = nt->size(0);
+  if (ntensors == 0) {
+    return true;
+  }
+  const Tensor& sizemat = nt->get_nested_sizes();
+  const Tensor& stridemat = nt->get_nested_strides();
+  const int64_t* offsets =
+      nt->get_storage_offsets().const_data_ptr<int64_t>();
+  const int64_t orig_dim = sizemat.size(1);
+  if (orig_dim == 0) {
+    // Nested scalars: any gap between consecutive scalars breaks contiguity.
+    for (int64_t i = 0; i < ntensors; ++i) {
+      if (offsets[i] != i) {
+        return false;
+      }
+    }
+    return true;
+  }
+  const int64_t* sizes = sizemat.const_data_ptr<int64_t>();
+  const int64_t* strides = stridemat.const_data_ptr<int64_t>();
+  // First pass: every underlying tensor must itself be contiguous.
+  for (int64_t i = 0; i < ntensors; ++i) {
+    const int64_t* row_sizes = sizes + i * orig_dim;
+    const int64_t* row_strides = strides + i * orig_dim;
+    int64_t expected_stride = 1;
+    for (int64_t j = orig_dim - 1; j >= 0; --j) {
+      if (row_strides[j] != expected_stride) {
+        return false;
+      }
+      expected_stride *= row_sizes[j];
+    }
+  }
+  // Second pass: no blank memory before the first tensor or between tensors.
+  if (offsets[0] != 0) {
+    return false;
+  }
+  for (int64_t i = 1; i < ntensors; ++i) {
+    const int64_t prev_row = (i - 1) * orig_dim;
+    const int64_t prev_extent = sizes[prev_row] * strides[prev_row];
+    if (offsets[i] != offsets[i - 1] + prev_extent) {
+      return false;
+    }
+  }
+  return true;
+}
+// Convenience wrapper: runs the checked get_nested_tensor_impl() cast on
+// `tensor` and returns that impl's nested sizes.
+inline const at::Tensor& get_nested_sizes(const at::Tensor& tensor) {
+  const NestedTensorImpl* impl = get_nested_tensor_impl(tensor);
+  return impl->get_nested_sizes();
+}
+} // namespace at::native
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/NumericUtils.h ADDED Viewed

	@@ -0,0 +1,208 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#ifdef __HIPCC__
+#include <hip/hip_runtime.h>
+#endif
+#include <c10/macros/Macros.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/Float8_e4m3fn.h>
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e5m2.h>
+#include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/Half.h>
+#include <c10/util/complex.h>
+#include <cmath>
+#include <type_traits>
+namespace at {
+// std::isnan isn't performant to use on integral types; it will
+// (uselessly) convert to floating point and then do the test.
+// This overload avoids that: for integral T it simply returns false.
+template <typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T /*val*/) {
+  return false;
+}
+// Floating-point overload. Uses std::isnan for ordinary host compilation and
+// the global-namespace ::isnan when __CUDACC__ or __HIPCC__ is defined.
+template <typename T, std::enable_if_t<std::is_floating_point_v<T>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
+  return std::isnan(val);
+#else
+  return ::isnan(val);
+#endif
+}
+// Complex overload: NaN when the real part or the imaginary part is NaN.
+// The imaginary part is only inspected when the real part is not NaN.
+template <typename T, std::enable_if_t<c10::is_complex<T>::value, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  if (std::isnan(val.real())) {
+    return true;
+  }
+  return std::isnan(val.imag());
+}
+// at::Half overload: converts to float and defers to the float overload.
+template <typename T, std::enable_if_t<std::is_same_v<T, at::Half>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  const float widened = static_cast<float>(val);
+  return at::_isnan(widened);
+}
+// at::BFloat16 overloads: convert to float and defer to the float overload.
+//
+// The template form takes `T val` (like the Half and Float8 overloads) so
+// that T can be deduced from the argument; with a parameter spelled
+// `at::BFloat16 val`, T appears in no deduced context and the template could
+// only be reached through explicit template arguments. For an ordinary call
+// `_isnan(bf16)` the non-template overload below is still the one chosen,
+// because a non-template function is preferred over an equally good template
+// specialization.
+template <
+    typename T,
+    std::enable_if_t<std::is_same_v<T, at::BFloat16>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return at::_isnan(static_cast<float>(val));
+}
+inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) {
+  return at::_isnan(static_cast<float>(val));
+}
+// 8-bit float overloads (Float8_e5m2, Float8_e4m3fn, Float8_e5m2fnuz,
+// Float8_e4m3fnuz): each one delegates to the type's own isnan() member.
+template <
+    typename T,
+    std::enable_if_t<std::is_same_v<T, at::Float8_e5m2>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return val.isnan();
+}
+template <
+    typename T,
+    std::enable_if_t<std::is_same_v<T, at::Float8_e4m3fn>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return val.isnan();
+}
+template <
+    typename T,
+    std::enable_if_t<std::is_same_v<T, at::Float8_e5m2fnuz>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return val.isnan();
+}
+template <
+    typename T,
+    std::enable_if_t<std::is_same_v<T, at::Float8_e4m3fnuz>, int> = 0>
+inline C10_HOST_DEVICE bool _isnan(T val) {
+  return val.isnan();
+}
+// std::isinf isn't performant to use on integral types; it will
+// (uselessly) convert to floating point and then do the test.
+// This overload avoids that: for integral T it simply returns false.
+template <typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
+inline C10_HOST_DEVICE bool _isinf(T /*val*/) {
+  return false;
+}
+// Floating-point overload. Uses std::isinf for ordinary host compilation and
+// the global-namespace ::isinf when __CUDACC__ or __HIPCC__ is defined.
+template <typename T, std::enable_if_t<std::is_floating_point_v<T>, int> = 0>
+inline C10_HOST_DEVICE bool _isinf(T val) {
+#if !defined(__CUDACC__) && !defined(__HIPCC__)
+  return std::isinf(val);
+#else
+  return ::isinf(val);
+#endif
+}
+// Reduced-precision overloads of _isinf.
+// Half and BFloat16 are converted to float and checked by the float overload.
+inline C10_HOST_DEVICE bool _isinf(at::Half val) {
+  const float widened = static_cast<float>(val);
+  return at::_isinf(widened);
+}
+inline C10_HOST_DEVICE bool _isinf(at::BFloat16 val) {
+  const float widened = static_cast<float>(val);
+  return at::_isinf(widened);
+}
+// Float8_e5m2 answers through its own isinf() member.
+inline C10_HOST_DEVICE bool _isinf(at::Float8_e5m2 val) {
+  return val.isinf();
+}
+// The remaining Float8 variants ignore their argument and always report
+// false (presumably these formats have no infinity encoding; confirm against
+// the type definitions).
+inline C10_HOST_DEVICE bool _isinf(at::Float8_e4m3fn /*val*/) {
+  return false;
+}
+inline C10_HOST_DEVICE bool _isinf(at::Float8_e5m2fnuz /*val*/) {
+  return false;
+}
+inline C10_HOST_DEVICE bool _isinf(at::Float8_e4m3fnuz /*val*/) {
+  return false;
+}
+// exp for float or less precise types. When __CUDA_ARCH__ or __HIP_ARCH__ is
+// defined it uses the __expf fast approximation; otherwise it calls ::exp.
+// double goes through the explicit specialization below, which always calls
+// ::exp; the static_assert rejects double in the primary template.
+template <typename T>
+C10_HOST_DEVICE inline T exp(T x) {
+  static_assert(
+      !std::is_same_v<T, double>,
+      "this template must be used with float or less precise type");
+#if !defined(__CUDA_ARCH__) && !defined(__HIP_ARCH__)
+  return ::exp(x);
+#else
+  // use __expf fast approximation for peak bandwidth
+  return __expf(x);
+#endif
+}
+template <>
+C10_HOST_DEVICE inline double exp<double>(double x) {
+  return ::exp(x);
+}
+// log for float or less precise types. When __CUDA_ARCH__ or __HIP_ARCH__ is
+// defined it uses the __logf fast approximation; otherwise it calls ::log.
+// double goes through the explicit specialization below, which always calls
+// ::log; the static_assert rejects double in the primary template.
+template <typename T>
+C10_HOST_DEVICE inline T log(T x) {
+  static_assert(
+      !std::is_same_v<T, double>,
+      "this template must be used with float or less precise type");
+#if !defined(__CUDA_ARCH__) && !defined(__HIP_ARCH__)
+  return ::log(x);
+#else
+  // use __logf fast approximation for peak bandwidth
+  return __logf(x);
+#endif
+}
+template <>
+C10_HOST_DEVICE inline double log<double>(double x) {
+  return ::log(x);
+}
+// log1p for float or less precise types. When __CUDA_ARCH__ or __HIP_ARCH__
+// is defined it is computed as __logf(1.0f + x), trading precision for speed;
+// otherwise it calls ::log1p. double goes through the explicit specialization
+// below, which always calls ::log1p.
+template <typename T>
+C10_HOST_DEVICE inline T log1p(T x) {
+  static_assert(
+      !std::is_same_v<T, double>,
+      "this template must be used with float or less precise type");
+#if !defined(__CUDA_ARCH__) && !defined(__HIP_ARCH__)
+  return ::log1p(x);
+#else
+  // use __logf fast approximation for peak bandwidth
+  // NOTE: There is no __log1pf so unfortunately we lose precision.
+  return __logf(1.0f + x);
+#endif
+}
+template <>
+C10_HOST_DEVICE inline double log1p<double>(double x) {
+  return ::log1p(x);
+}
+// tan for float or less precise types. When __CUDA_ARCH__ or __HIP_ARCH__ is
+// defined it uses the __tanf fast approximation; otherwise it calls ::tan.
+// double goes through the explicit specialization below, which always calls
+// ::tan; the static_assert rejects double in the primary template.
+template <typename T>
+C10_HOST_DEVICE inline T tan(T x) {
+  static_assert(
+      !std::is_same_v<T, double>,
+      "this template must be used with float or less precise type");
+#if !defined(__CUDA_ARCH__) && !defined(__HIP_ARCH__)
+  return ::tan(x);
+#else
+  // use __tanf fast approximation for peak bandwidth
+  return __tanf(x);
+#endif
+}
+template <>
+C10_HOST_DEVICE inline double tan<double>(double x) {
+  return ::tan(x);
+}
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ParallelOpenMP.h ADDED Viewed

	@@ -0,0 +1,59 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <algorithm>
+#include <atomic>
+#include <cstddef>
+#include <exception>
+#ifdef _OPENMP
+#define INTRA_OP_PARALLEL
+#include <omp.h>
+#endif
+#ifdef _OPENMP
+namespace at::internal {
+// Runs `f` over [begin, end) inside an OpenMP parallel region.
+//
+// Each participating thread gets one contiguous chunk and calls
+// f(chunk_begin, chunk_end). When grain_size > 0 the number of chunks is
+// capped at divup(end - begin, grain_size). Threads whose chunk would start
+// at or beyond `end` do nothing.
+//
+// Exceptions thrown by `f` do not escape the parallel region: the first one
+// recorded (guarded by an atomic flag) is rethrown on the calling thread
+// after the region ends; any others are dropped.
+//
+// @param begin       first index of the range
+// @param end         one past the last index of the range
+// @param grain_size  if > 0, limits how finely the range is split
+// @param f           callable invoked as f(int64_t, int64_t)
+template <typename F>
+inline void invoke_parallel(
+    int64_t begin,
+    int64_t end,
+    int64_t grain_size,
+    const F& f) {
+  // An empty (or inverted) range has no work. Returning early also avoids
+  // the case grain_size > 0 && end == begin, where the thread count below
+  // would be clamped to divup(0, grain_size) (presumably 0) and then used
+  // as the divisor when computing chunk_size.
+  if (begin >= end) {
+    return;
+  }
+  std::atomic_flag err_flag = ATOMIC_FLAG_INIT;
+  std::exception_ptr eptr;
+#pragma omp parallel
+  {
+    // choose number of tasks based on grain size and number of threads
+    // can't use num_threads clause due to bugs in GOMP's thread pool (See
+    // #32008)
+    int64_t num_threads = omp_get_num_threads();
+    if (grain_size > 0) {
+      num_threads = std::min(num_threads, divup((end - begin), grain_size));
+    }
+    int64_t tid = omp_get_thread_num();
+    int64_t chunk_size = divup((end - begin), num_threads);
+    int64_t begin_tid = begin + tid * chunk_size;
+    if (begin_tid < end) {
+      try {
+        internal::ThreadIdGuard tid_guard(tid);
+        f(begin_tid, std::min(end, chunk_size + begin_tid));
+      } catch (...) {
+        // Only the first thread to get here stores its exception.
+        if (!err_flag.test_and_set()) {
+          eptr = std::current_exception();
+        }
+      }
+    }
+  }
+  if (eptr) {
+    std::rethrow_exception(eptr);
+  }
+}
+} // namespace at::internal
+#endif // _OPENMP
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/RedispatchFunctions.h ADDED Viewed

The diff for this file is too large to render. See raw diff

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/RegistrationDeclarations.h ADDED Viewed

The diff for this file is too large to render. See raw diff

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/SDPBackend.h ADDED Viewed

	@@ -0,0 +1,21 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <cstdint>
+namespace at {
+constexpr int32_t num_sdp_backends = 5; // matches the five non-error SDPBackend values below (0..4)
+enum class SDPBackend { // identifiers for scaled-dot-product attention backends (per the header name; usage not shown here)
+  error = -1,
+  math = 0,
+  flash_attention = 1,
+  efficient_attention = 2,
+  cudnn_attention = 3,
+  overrideable = 4
+};
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Scalar.h ADDED Viewed

	@@ -0,0 +1,8 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/Scalar.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/StorageUtils.h ADDED Viewed

	@@ -0,0 +1,54 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/Storage.h>
+#include <c10/core/StorageImpl.h>
+#include <c10/util/intrusive_ptr.h>
+namespace at {
+class TensorBase;
+// Here we define a series of utils to create/manipulate ATen backed
+// c10 storage implementations.
+/**
+ * Create a new shared memory storage impl managed by file descriptor
+ *
+ * @param size  size of the new storage in bytes
+ */
+C10_EXPORT c10::intrusive_ptr<c10::StorageImpl> new_shm_fd_storage(size_t size);
+/**
+ * Copy src to dst
+ * Caller must guarantee the validity of the storage objects
+ * during the entire copy process, esp. when it's async.
+ *
+ * This can probably live in c10 namespace later if needed,
+ * but for now keep it in at to keep implementation simple.
+ *
+ * @param dst  destination storage
+ * @param src  source storage
+ * @param non_blocking  (default false) presumably true requests a copy that does not block the caller; confirm in the implementation
+ */
+C10_EXPORT void storage_copy(
+    c10::Storage& dst,
+    const c10::Storage& src,
+    bool non_blocking = false);
+/**
+ * Change the tensor's storage in place to be shm (shared memory) based.
+ *
+ * This is only applicable to CPU tensors not already shared.
+ * Otherwise, it's a no op to mirror the THP tensor behavior:
+ * https://pytorch.org/docs/stable/generated/torch.Tensor.share_memory_.html
+ *
+ * @param t  a tensor
+ */
+C10_EXPORT void share_memory_(TensorBase& t);
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/TensorAccessor.h ADDED Viewed

	@@ -0,0 +1,7 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/core/TensorAccessor.h>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ThreadLocalPythonObjects.h ADDED Viewed

	@@ -0,0 +1,26 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/SafePyObject.h>
+#include <c10/macros/Macros.h>
+#include <unordered_map>
+namespace at::impl {
+struct TORCH_API ThreadLocalPythonObjects { // string-keyed collection of shared SafePyObject handles
+  static void set(const std::string& key, std::shared_ptr<SafePyObject> value); // presumably stores `value` under `key`
+  static const std::shared_ptr<SafePyObject>& get(const std::string& key); // presumably looks up `key`; behavior for a missing key is not visible here
+  static bool contains(const std::string& key);
+  static const ThreadLocalPythonObjects& get_state(); // presumably the current thread's instance; confirm in the .cpp
+  static void set_state(ThreadLocalPythonObjects state);
+ private:
+  std::unordered_map<std::string, std::shared_ptr<c10::SafePyObject>> obj_dict_; // the key -> object map itself
+};
+} // namespace at::impl
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/ThreadLocalState.h ADDED Viewed

	@@ -0,0 +1,131 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <c10/core/InferenceMode.h>
+#include <c10/core/impl/LocalDispatchKeySet.h>
+#include <c10/util/Exception.h>
+#include <c10/util/ThreadLocalDebugInfo.h>
+#include <ATen/FuncTorchTLS.h>
+#include <ATen/PythonTorchFunctionTLS.h>
+#include <ATen/SavedTensorHooks.h>
+#include <ATen/ThreadLocalPythonObjects.h>
+#include <ATen/record_function.h>
+#include <c10/core/impl/PythonDispatcherTLS.h>
+#include <c10/core/impl/TorchDispatchModeTLS.h>
+namespace at {
+// Thread local state contains values that are preserved across
+// thread boundaries (e.g. at::launch/JIT fork, autograd).
+// Note at::parallel_for doesn't preserve TLS across thread boundaries.
+class TORCH_API ThreadLocalState {
+ public:
+  // Saves the thread local variables' values and
+  // returns them as a ThreadLocalState
+  ThreadLocalState();
+  // set_grad_mode - force the value of the grad mode TLS in
+  //  the current state object. This is used for example in the
+  //  autograd engine.
+  void set_grad_mode(bool enabled);
+  // set_multithreading_enabled - force the value of the
+  //  multithreading-enabled TLS in
+  //  the current state object. This is used for example in the
+  //  autograd engine.
+  void set_multithreading_enabled(bool enabled);
+  // Sets thread local variables in the current thread,
+  // according to the thread boundary specified
+  static void setThreadLocalState(const ThreadLocalState& state);
+ private:
+  c10::impl::LocalDispatchKeySet dispatch_key_;
+  // ThreadLocalDebugInfo does not change after being created
+  // with DebugInfoGuard
+  std::shared_ptr<c10::ThreadLocalDebugInfo> debug_info_;
+  // RecordFunction TLS
+  RecordFunctionTLS rf_tls_;
+  // TLS for out-of-tree functorch
+  // See NOTE [functorch TLS in pytorch/pytorch] for why this needs to be a
+  // pointer (spoiler alert: it's due to the indirection)
+  // This needs to be a shared_ptr instead of a unique_ptr because
+  // ThreadLocalState is copy-able and does indeed get copied. Maybe we can
+  // consider adding an explicit copy constructor for ThreadLocalState in the
+  // future but I didn't want to add one just for this.
+  std::shared_ptr<const functorch::FuncTorchTLSBase> functorch_tls_;
+  // TLS for AutogradModes
+  AutogradState autograd_tls_;
+  // TLS for enable_torch_dispatch_mode
+  c10::impl::TorchDispatchModeTLS torch_dispatch_mode_state_;
+  // TLS for enable_python_dispatcher
+  c10::impl::PyInterpreter* python_dispatcher_state_;
+  // TLS for __torch_function__ (mode and disable_torch_function)
+  at::impl::PythonTorchFunctionTLS python_torch_function_state_;
+  // TLS for saved tensors default hooks
+  at::impl::SavedTensorDefaultHooksTLS saved_tensors_default_hooks_state_;
+  bool functionalization_reapply_views_state_;
+  bool dtensor_allow_implicit_replication_;
+  // TLS for arbitrary python objects that are registered via hooks
+  at::impl::ThreadLocalPythonObjects saved_objects_;
+#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && \
+    !defined(BUILD_LITE_INTERPRETER)
+  // TLS for autocast dtypes
+  std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
+      autocast_dtypes_{};
+#endif
+  friend class ThreadLocalStateGuard;
+};
+// Scope guard: snapshots the current thread local state, installs `state`, and restores the snapshot on destruction
+class TORCH_API ThreadLocalStateGuard {
+ public:
+  explicit ThreadLocalStateGuard(const ThreadLocalState& state)
+      : prev_state_(ThreadLocalState()) {
+    // prev_state_ was captured above, before this call replaces the thread's state with `state`
+    ThreadLocalState::setThreadLocalState(state);
+  }
+  ThreadLocalStateGuard(ThreadLocalStateGuard&& other) = delete;
+  ThreadLocalStateGuard(const ThreadLocalStateGuard&) = delete;
+  ThreadLocalStateGuard& operator=(const ThreadLocalStateGuard&) = delete;
+  ThreadLocalStateGuard& operator=(ThreadLocalStateGuard&&) = delete;
+  ~ThreadLocalStateGuard() {
+    // restore the state that was captured when the guard was constructed
+    ThreadLocalState::setThreadLocalState(prev_state_);
+  }
+ private:
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const ThreadLocalState prev_state_;
+};
+// Wraps `callback` in a lambda that carries the thread local state captured
+// at wrap time. Each invocation installs that state through a
+// ThreadLocalStateGuard for the duration of the call, forwards its arguments
+// to `callback`, and returns whatever `callback` returns.
+template <typename T>
+auto wrapPropagateTLSState(T callback) {
+  return [tls_state = ThreadLocalState(),
+          callback = std::move(callback)](auto&&... args) {
+    // Restores the previous state when the call finishes.
+    ThreadLocalStateGuard state_guard(tls_state);
+    return callback(std::forward<decltype(args)>(args)...);
+  };
+}
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/Utils.h ADDED Viewed

	@@ -0,0 +1,143 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+#include <ATen/EmptyTensor.h>
+#include <ATen/Formatting.h>
+#include <ATen/core/ATenGeneral.h>
+#include <ATen/core/Generator.h>
+#include <c10/core/ScalarType.h>
+#include <c10/core/StorageImpl.h>
+#include <c10/core/UndefinedTensorImpl.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Exception.h>
+#include <c10/util/accumulate.h>
+#include <c10/util/irange.h>
+#include <algorithm>
+// Expands to deleted declarations of TypeName's copy constructor and copy
+// assignment operator. The expansion has no trailing semicolon, so the use
+// site supplies it.
+#define AT_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&) = delete;         \
+  void operator=(const TypeName&) = delete
+namespace at {
+TORCH_API int _crash_if_asan(int /*arg*/);
+// Converts a TensorList (i.e. ArrayRef<Tensor>) to a vector of TensorImpl*.
+// Every element must have Strided layout, the given device type and the
+// given scalar type; the first element that violates one of these (checked
+// in that order) triggers a TORCH_CHECK failure naming the element index,
+// the argument position `pos` and the argument `name`.
+// NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.
+// Once cat is ported entirely to ATen this can be deleted!
+inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
+    ArrayRef<Tensor> tensors,
+    const char* name,
+    int pos,
+    c10::DeviceType device_type,
+    ScalarType scalar_type) {
+  std::vector<TensorImpl*> impls;
+  impls.reserve(tensors.size());
+  for (const auto idx : c10::irange(tensors.size())) {
+    const auto& element = tensors[idx];
+    const bool is_strided = element.layout() == Layout::Strided;
+    if (!is_strided) {
+      TORCH_CHECK(
+          false,
+          "Expected dense tensor but got ",
+          element.layout(),
+          " for sequence element ",
+          idx,
+          " in sequence argument at position #",
+          pos,
+          " '",
+          name,
+          "'");
+    }
+    const bool device_matches = element.device().type() == device_type;
+    if (!device_matches) {
+      TORCH_CHECK(
+          false,
+          "Expected object of device type ",
+          device_type,
+          " but got device type ",
+          element.device().type(),
+          " for sequence element ",
+          idx,
+          " in sequence argument at position #",
+          pos,
+          " '",
+          name,
+          "'");
+    }
+    const bool dtype_matches = element.scalar_type() == scalar_type;
+    if (!dtype_matches) {
+      TORCH_CHECK(
+          false,
+          "Expected object of scalar type ",
+          scalar_type,
+          " but got scalar type ",
+          element.scalar_type(),
+          " for sequence element ",
+          idx,
+          " in sequence argument at position #",
+          pos,
+          " '",
+          name,
+          "'");
+    }
+    impls.push_back(element.unsafeGetTensorImpl());
+  }
+  return impls;
+}
+// Turns `list` into a fixed-size std::array<int64_t, N>.
+//   * A single-element list with N > 1 is broadcast: every slot gets list[0].
+//   * Otherwise the list must hold exactly N entries, which are copied over;
+//     any other length fails a TORCH_CHECK that reports `pos` and `name`.
+template <size_t N>
+std::array<int64_t, N> check_intlist(
+    ArrayRef<int64_t> list,
+    const char* name,
+    int pos) {
+  if (list.empty()) {
+    // TODO: is this necessary?  We used to treat nullptr-vs-not in IntList
+    // differently with strides as a way of faking optional.
+    list = {};
+  }
+  std::array<int64_t, N> result{};
+  const auto given = list.size();
+  if (given == 1 && N > 1) {
+    result.fill(list[0]);
+    return result;
+  }
+  if (given != N) {
+    TORCH_CHECK(
+        false,
+        "Expected a list of ",
+        N,
+        " ints but got ",
+        list.size(),
+        " for argument #",
+        pos,
+        " '",
+        name,
+        "'");
+  }
+  std::copy_n(list.begin(), N, result.begin());
+  return result;
+}
+using at::detail::check_size_nonnegative;
+namespace detail { // declarations only; the definitions are not in this header
+template <typename T>
+TORCH_API Tensor tensor_cpu(ArrayRef<T> values, const TensorOptions& options);
+template <typename T>
+TORCH_API Tensor
+tensor_backend(ArrayRef<T> values, const TensorOptions& options);
+template <typename T>
+TORCH_API Tensor
+tensor_complex_cpu(ArrayRef<T> values, const TensorOptions& options);
+template <typename T>
+TORCH_API Tensor
+tensor_complex_backend(ArrayRef<T> values, const TensorOptions& options);
+} // namespace detail
+} // namespace at
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/ATen/cpp_custom_type_hack.h ADDED Viewed

	@@ -0,0 +1,115 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// YOU ARE IN THE WRONG PLACE! TURN BACK NOW!
+// This code was a temporary hack to enable embedding arbitrary C++ structures
+// into Tensors. THIS IS UNSAFE AND IS NOT SUPPORTED. IF YOU USE THIS CODE,
+// IT __WILL__ BREAK.
+// This code has been superseded by custom classes:
+// https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html
+// Please use custom classes and **DO NOT ADD MORE CALLSITES TO THINGS DEFINED
+// IN THIS FILE**.
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+// STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP STOP
+#include <ATen/TracerMode.h>
+#include <ATen/core/Tensor.h>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty.h>
+#endif
+namespace at::cpp_custom_type_hack {
+// Reports whether `packed` looks like a wrapper created for type T: its
+// scalar type is kByte and its storage's deleter is the deleteFn registered
+// for T's TypeMeta. The deleter is only inspected when the dtype matches.
+template <typename T>
+[[deprecated(
+    "Use custom classes instead: "
+    "https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html")]] bool
+isa(const Tensor& packed) {
+  if (packed.scalar_type() != kByte) {
+    return false;
+  }
+  return packed.storage().data_ptr().get_deleter() ==
+      caffe2::TypeMeta::Make<T>().deleteFn();
+}
+// Returns a reference to the T object stored behind `packed`'s storage
+// pointer. Two TORCH_CHECKs run first: the tensor must have scalar type
+// kByte, and its storage deleter must be the deleteFn registered for T.
+template <typename T>
+[[deprecated(
+    "Use custom classes instead: "
+    "https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html")]] T&
+cast(const Tensor& packed) {
+  TORCH_CHECK(
+      packed.scalar_type() == kByte, "Expected temporary cpp type wrapper");
+  TORCH_CHECK(
+      packed.storage().data_ptr().get_deleter() ==
+          caffe2::TypeMeta::Make<T>().deleteFn(),
+      "Expected temporary cpp type wrapper of type ",
+      caffe2::TypeMeta::TypeName<T>());
+  T* payload = reinterpret_cast<T*>(packed.storage().data_ptr().get());
+  return *payload;
+}
+/// Wraps `ptr` in a CPU byte tensor of sizeof(T) elements whose storage data
+/// pointer is the released object and whose deleter is the delete function
+/// caffe2::TypeMeta associates with T.
+///
+/// @param ptr     object to wrap; ownership is released into the tensor's
+///                storage.
+/// @param options tensor options; device and dtype are overridden with CPU
+///                and kByte.
+/// @return the wrapper tensor.
+template <typename T>
+[[deprecated(
+    "Use custom classes instead: "
+    "https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html")]] Tensor
+create(std::unique_ptr<T> ptr, TensorOptions options) {
+  // None of this should trace, so turn off Tracer dispatching
+  at::AutoDispatchBelowADInplaceOrView guard; // TODO: remove
+  at::tracer::impl::NoTracerDispatchMode tracer_guard;
+  // We store this instance away in a Tensor and register a deleter function
+  // so that we do not leak memory. On the other side, we pull out the storage's
+  // data_ptr and get the right typed pointer.
+  // From here on the DataPtr (with T's delete function) owns the object.
+  void* raw_ptr = ptr.release();
+  at::DataPtr at_ptr(
+      raw_ptr, raw_ptr, caffe2::TypeMeta::Make<T>().deleteFn(), at::kCPU);
+  // The size does not really matter, but we set it to sizeof(T) so that it
+  // matches the wrapped object. (Original note: a tensor is returned because
+  // one likely wants to use this hack from Python.)
+  auto retval = at::empty({sizeof(T)}, options.device(kCPU).dtype(at::kByte));
+  // Replace the freshly created storage pointer with the wrapped object.
+  retval.storage().set_data_ptr_noswap(std::move(at_ptr));
+  return retval;
+}
+} // namespace at::cpp_custom_type_hack
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/THC/THCAtomics.cuh ADDED Viewed

	@@ -0,0 +1,8 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// TODO: Remove once torchvision has been updated to use the ATen header
+#include <ATen/cuda/Atomic.cuh>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/THC/THCDeviceUtils.cuh ADDED Viewed

	@@ -0,0 +1,8 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// TODO: Remove this header
+#include <ATen/cuda/DeviceUtils.cuh>
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/ConvUtils.h ADDED Viewed

	@@ -0,0 +1,195 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include <array>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+namespace fbgemm {
+/// Builds a std::array<int, N> that starts with the explicitly supplied
+/// Vals... and is padded with 1s up to N elements. Called as
+/// array_of_ones<N>() it yields N ones.
+template <int N, int... Vals>
+constexpr std::array<int, N> array_of_ones() {
+  if constexpr (N == sizeof...(Vals)) {
+    // Enough values collected: materialize the array.
+    return std::array<int, N>{{Vals...}};
+  } else {
+    // Append one more 1 and recurse until N values are collected.
+    return array_of_ones<N, Vals..., 1>();
+  }
+}
+/// Builds a std::array<int, N> that starts with the explicitly supplied
+/// Vals... and is padded with 0s up to N elements. Called as
+/// array_of_zeroes<N>() it yields N zeroes.
+template <int N, int... Vals>
+constexpr std::array<int, N> array_of_zeroes() {
+  if constexpr (N == sizeof...(Vals)) {
+    // Enough values collected: materialize the array.
+    return std::array<int, N>{{Vals...}};
+  } else {
+    // Append one more 0 and recurse until N values are collected.
+    return array_of_zeroes<N, Vals..., 0>();
+  }
+}
+/**
+ * @brief A struct to conveniently store all convolution parameters.
+ *
+ * @tparam SPATIAL_DIM number of spatial dimensions; it is the length of the
+ *                     per-dimension arrays (pad holds 2 * SPATIAL_DIM
+ *                     entries).
+ */
+template <int SPATIAL_DIM = 2>
+struct conv_param_t {
+  int MB; ///< Mini Batch size
+  int IC; ///< Number of Input Channels
+  int OC; ///< Number of Output Channels
+  std::array<int, SPATIAL_DIM> IN_DIM; ///< Input Image Dimension
+  int G; ///< Number of Groups
+  std::array<int, SPATIAL_DIM> K; ///< Filter (Kernel) dimensions
+  std::array<int, SPATIAL_DIM> stride; ///< Strides
+  std::array<int, SPATIAL_DIM * 2>
+      pad; ///< Padding (first SPATIAL_DIM is for prev/top/left padding, second
+           ///< SPATIAL_DIM is for next/bottom/right padding)
+  std::array<int, SPATIAL_DIM> dilation; ///< Kernel dilation
+  // The following are derived parameters
+  std::array<int, SPATIAL_DIM> OUT_DIM; ///< Output Image Dimension
+  std::array<int, SPATIAL_DIM> IN_DIMP; ///< Input Image Dimension Padded
+  // The following is for transposed convolution
+  std::array<int, SPATIAL_DIM>
+      output_pad; ///< Padding (next/bottom/right padding in output buffer)
+  bool transposed; ///< Whether this describes a transposed convolution
+  /**
+   * @brief Constructor for initializing the convolution parameters.
+   *
+   * Stores the given parameters and derives IN_DIMP and OUT_DIM for every
+   * spatial dimension.
+   *
+   * @param mb mini batch size
+   * @param ic number of input channels
+   * @param oc number of output channels
+   * @param in_dim input image dimensions
+   * @param g number of groups; must be non-zero and divide both ic and oc
+   * @param k filter (kernel) dimensions
+   * @param strd strides
+   * @param pd padding (2 * SPATIAL_DIM entries, see pad)
+   * @param dilations kernel dilation, all ones by default
+   * @param otpt_pd output padding, all zeroes by default
+   * @param transposed whether this is a transposed convolution
+   *
+   * @throws std::runtime_error if g is zero or does not divide ic or oc.
+   */
+  conv_param_t(
+      int mb,
+      int ic,
+      int oc,
+      std::array<int, SPATIAL_DIM> in_dim,
+      int g,
+      std::array<int, SPATIAL_DIM> k,
+      std::array<int, SPATIAL_DIM> strd,
+      std::array<int, SPATIAL_DIM * 2> pd,
+      std::array<int, SPATIAL_DIM> dilations = array_of_ones<SPATIAL_DIM>(),
+      std::array<int, SPATIAL_DIM> otpt_pd = array_of_zeroes<SPATIAL_DIM>(),
+      bool transposed = false)
+      : MB(mb),
+        IC(ic),
+        OC(oc),
+        IN_DIM(in_dim),
+        G(g),
+        K(k),
+        stride(strd),
+        pad(pd),
+        dilation(dilations),
+        output_pad(otpt_pd),
+        transposed(transposed) {
+    // Reject g == 0 up front: the divisibility checks below would otherwise
+    // perform an integer division by zero.
+    if (g == 0) {
+      throw std::runtime_error(
+          "groups = " + std::to_string(g) +
+          " is not a valid number of groups");
+    }
+    if (ic % g != 0) {
+      throw std::runtime_error(
+          "groups = " + std::to_string(g) +
+          " does not divide number of input channels = " + std::to_string(ic));
+    }
+    if (oc % g != 0) {
+      throw std::runtime_error(
+          "groups = " + std::to_string(g) +
+          " does not divide number of output channels = " + std::to_string(oc));
+    }
+    for (int d = 0; d < SPATIAL_DIM; ++d) {
+      if (transposed) {
+        // Transposed convolution: the padded input grows by the dilated
+        // kernel extent minus the padding on each side.
+        this->IN_DIMP[d] = this->IN_DIM[d] +
+            (this->dilation[d] * (this->K[d] - 1) - this->pad[d]) +
+            (this->dilation[d] * (this->K[d] - 1) - this->pad[SPATIAL_DIM + d]);
+        this->OUT_DIM[d] = (this->IN_DIM[d] - 1) * this->stride[d] -
+            this->pad[d] - this->pad[SPATIAL_DIM + d] +
+            this->dilation[d] * (this->K[d] - 1) + output_pad[d] + 1;
+      } else {
+        // Regular convolution: pad on both sides, then slide the dilated
+        // kernel with the given stride.
+        IN_DIMP[d] = IN_DIM[d] + pad[d] + pad[SPATIAL_DIM + d];
+        OUT_DIM[d] =
+            (IN_DIMP[d] - dilation[d] * (K[d] - 1) - 1) / stride[d] + 1;
+      }
+    }
+  }
+  /**
+   * @brief Helper function to get convolution parameters as string.
+   *
+   * Fields are emitted as "name:value" separated by ", " with no trailing
+   * separator. For SPATIAL_DIM <= 3 the spatial axes are labelled with the
+   * last SPATIAL_DIM entries of {"T", "H", "W"}; otherwise they are labelled
+   * with their index. All 2 * SPATIAL_DIM padding entries are printed, and
+   * output padding is appended only for transposed convolutions.
+   *
+   * @return the formatted parameter string.
+   */
+  std::string toString() const {
+    std::string dim_string[3] = {"T", "H", "W"};
+    std::string out;
+    out += "MB:" + std::to_string(MB) + ", ";
+    out += "IC:" + std::to_string(IC) + ", ";
+    out += "OC:" + std::to_string(OC) + ", ";
+    if constexpr (SPATIAL_DIM <= 3) {
+      for (int d = 0; d < SPATIAL_DIM; ++d) {
+        out += "I" + dim_string[3 - SPATIAL_DIM + d] + ":" +
+            std::to_string(IN_DIM[d]) + ", ";
+      }
+    } else {
+      for (int d = 0; d < SPATIAL_DIM; ++d) {
+        out += "I" + std::to_string(d) + ":" + std::to_string(IN_DIM[d]) + ", ";
+      }
+    }
+    out += "G:" + std::to_string(G) + ", ";
+    if constexpr (SPATIAL_DIM <= 3) {
+      for (int d = 0; d < SPATIAL_DIM; ++d) {
+        out += "K" + dim_string[3 - SPATIAL_DIM + d] + ":" +
+            std::to_string(K[d]) + ", ";
+      }
+      for (int d = 0; d < SPATIAL_DIM; ++d) {
+        out += "stride_" + dim_string[3 - SPATIAL_DIM + d] + ":" +
+            std::to_string(stride[d]) + ", ";
+      }
+      for (int d = 0; d < SPATIAL_DIM * 2; ++d) {
+        out += "pad_" + dim_string[3 - SPATIAL_DIM + (d % SPATIAL_DIM)] + ":" +
+            std::to_string(pad[d]) + ", ";
+      }
+      for (int d = 0; d < SPATIAL_DIM; ++d) {
+        out += "dilation_" + dim_string[3 - SPATIAL_DIM + d] + ":" +
+            std::to_string(dilation[d]);
+        if (d < SPATIAL_DIM - 1) {
+          out += ", ";
+        }
+      }
+    } else {
+      for (int d = 0; d < SPATIAL_DIM; ++d) {
+        out += "K" + std::to_string(d) + ":" + std::to_string(K[d]) + ", ";
+      }
+      for (int d = 0; d < SPATIAL_DIM; ++d) {
+        out += "stride_" + std::to_string(d) + ":" + std::to_string(stride[d]) +
+            ", ";
+      }
+      // pad has 2 * SPATIAL_DIM entries; print all of them.
+      for (int d = 0; d < SPATIAL_DIM * 2; ++d) {
+        out += "pad_" + std::to_string(d) + ":" + std::to_string(pad[d]) + ", ";
+      }
+      // Dilation is the last unconditional field: no separator after the
+      // final entry, matching the SPATIAL_DIM <= 3 branch.
+      for (int d = 0; d < SPATIAL_DIM; ++d) {
+        out += "dilation_" + std::to_string(d) + ":" +
+            std::to_string(dilation[d]);
+        if (d < SPATIAL_DIM - 1) {
+          out += ", ";
+        }
+      }
+    }
+    if (transposed) {
+      // Lead with the separator so the suffix is not glued to the last
+      // dilation entry and no trailing ", " is left behind.
+      for (int d = 0; d < SPATIAL_DIM; ++d) {
+        out += ", output_padding_" + std::to_string(d) + ":" +
+            std::to_string(output_pad[d]);
+      }
+    }
+    return out;
+  }
+};
+} // namespace fbgemm
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/Fbgemm.h ADDED Viewed

	@@ -0,0 +1,1515 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+/**
+ * Top level include file for FBGEMM.
+ */
+#include <cassert>
+#include <memory>
+#include "./ConvUtils.h" // @manual
+#include "./FbgemmBuild.h" // @manual
+#include "./FbgemmEmbedding.h" // @manual
+#include "./FbgemmI8DepthwiseAvx2.h" // @manual
+#include "./FbgemmI8DirectconvAvx2.h" // @manual
+#include "./FbgemmI8Spmdm.h" // @manual
+#include "./FloatConversion.h" // @manual
+#include "./QuantUtilsAvx2.h" // @manual
+#include "./Types.h" // @manual
+#include "./Utils.h" // @manual
+// Turning on this option will print out time breakdown of each stage (e.g.,
+// input packing, the main GEMM kernel, each output processing pipeline).
+// Please note that currently this option won't report accurate timing if
+// multiple threads are used.
+// #define FBGEMM_MEASURE_TIME_BREAKDOWN
+#ifdef FBGEMM_MEASURE_TIME_BREAKDOWN
+#include <chrono>
+#include <iostream>
+extern double packing_time;
+extern double computing_time;
+extern double kernel_time;
+extern double postprocessing_time;
+extern double run_time;
+#endif
+namespace fbgemm {
+/**
+ * @brief Templatized struct for packing parameters for A and B matrices.
+ *
+ * @tparam T input type
+ * @tparam accT the type used for accumulation
+ * @tparam instSet anyarch/avx2/avx512
+ * @tparam int8Type an auxiliary template parameter to specialize for 8-bit
+ *                  input types.
+ */
+template <
+    typename T,
+    typename accT,
+    inst_set_t instSet,
+    typename int8Type = void>
+struct PackingTraits;
+// type specialized implementation in an include file
+#include "./PackingTraits-inl.h" // @manual
+/**
+ * @brief Base class for packing matrices for higher GEMM performance.
+ *
+ * Matrix is tiled into blockRows() * blockCols() blocks.
+ * Each block is with size blockRowSize() * blockColSize().
+ * This class is designed using CRTP
+ * (https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern):
+ * isPrePacked(), isA(), getRowOffsetBuffer(), isThisLastKBlock(), pack() and
+ * printPackedMatrix() forward to the derived packing type PT.
+ *
+ * @tparam PT actual packing type, e.g., PackAWithRowOffset
+ * @tparam inpType element type of the packed buffer
+ * @tparam accType accumulation type (defaults to std::int32_t)
+ */
+template <typename PT, typename inpType, typename accType = std::int32_t>
+class PackMatrix {
+ public:
+  PackMatrix() = delete; // no default constructor
+  PackMatrix(const PackMatrix&) = delete; // no copy
+  PackMatrix& operator=(const PackMatrix&) = delete; // no copy
+  PackMatrix(PackMatrix&&) = delete; // no move
+  PackMatrix& operator=(PackMatrix&& rhs) noexcept = delete; // no move
+  /**
+   * @param rows total number of rows in the matrix
+   *             (packed rows can be less than rows).
+   * @param cols total number of columns in the matrix
+   * @param pmat A buffer to contain the packed matrix.
+   *             If nullptr, a buffer owned by PackMatrix will be allocated
+   *             internally to contain the packed matrix.
+   *             For non-constant matrices like activation matrices, the client
+   *             code may want to pass a pre-allocated pmat to avoid the
+   *             overhead of internal memory allocation every time a PackMatrix
+   *             is constructed. The client code can query how big pmat should
+   *             be with packedBufferSize function.
+   * @param groups when groups > 1, we compute groups number of GEMMs each
+   *               multiplies A.rows by A.cols/A.groups matrix with
+   *               B.rows/B.groups by B.cols matrix (in conventional BLAS
+   *               terminology, this is a batched GEMM but we use the name group
+   *               to follow deep learning terminology). The result matrix has
+   *               dimension A.rows by B.cols*B.groups .
+   *               A.groups must be same as B.groups, A.groups must divide
+   *               A.cols, and B.groups must divide B.rows and C.cols.
+   * @param params blocking factors, nullptr by default.
+   *               NOTE(review): presumably nullptr selects default blocking
+   *               parameters; the constructor is defined elsewhere - confirm.
+   */
+  PackMatrix(
+      std::int32_t rows,
+      std::int32_t cols,
+      inpType* pmat,
+      int groups = 1,
+      const BlockingFactors* params = nullptr);
+  /**
+   * @return true usually when the matrix is constant matrix (e.g., weight
+   *         matrices) that can be prepacked
+   */
+  bool isPrePacked() const {
+    return static_cast<const PT*>(this)->isPrePacked();
+  }
+  /**
+   * @return true if this is the first input matrix in GEMM (i.e., A in C = A *
+   *         B)
+   */
+  static bool isA() {
+    return PT::isA();
+  }
+  /**
+   * @brief The size of the buffer used for packing (The size is in number of
+   *        elements).
+   *
+   * rows and cols are only used for fully packing, i.e., for B matrix.  The
+   * client code can use this function to query how big the buffer used for
+   * packing should be.
+   */
+  static int packedBufferSize(
+      int rows = 0,
+      int cols = 0,
+      const BlockingFactors* params = nullptr);
+  // The two forwarding functions below are wrapped in warning suppressions
+  // for -Wpragmas and -Winfinite-recursion.
+  FBGEMM_PUSH_WARNING_AND_DISABLE("-Wpragmas")
+  FBGEMM_PUSH_WARNING_AND_DISABLE("-Winfinite-recursion")
+  /**
+   * @return Pointer to a buffer containing row offset results. Some packing
+   *         objects fuse row offset computation for later requantization step.
+   */
+  std::int32_t* getRowOffsetBuffer() const {
+    return static_cast<const PT*>(this)->getRowOffsetBuffer();
+  }
+  /**
+   * @brief When k loop is also tiled/blocked, this function is used to check if
+   *        we have executed computations for the last k block so that we can
+   *        perform post-GEMM operations.
+   */
+  bool isThisLastKBlock(int block_id) const {
+    return static_cast<const PT*>(this)->isThisLastKBlock(block_id);
+  }
+  FBGEMM_POP_WARNING
+  FBGEMM_POP_WARNING
+  /**
+   * @brief Actual packing of a block of the source matrix in pmat buffer.
+   *
+   * Forwards to PT::pack(). When built for aarch64 without FBGEMM_FBCODE
+   * defined, it throws std::runtime_error instead.
+   */
+  void pack(const block_type_t& block) {
+#if defined(FBGEMM_FBCODE) || !defined(__aarch64__)
+    static_cast<PT*>(this)->pack(block);
+#else
+    throw std::runtime_error("PackMatrix::pack() not implemented for aarch64");
+#endif // defined(FBGEMM_FBCODE) || !defined(__aarch64__)
+  }
+  /**
+   * @return The stored row count (nrows_).
+   */
+  std::int32_t numRows() const {
+    return nrows_;
+  }
+  /**
+   * @return The stored column count (ncols_).
+   */
+  std::int32_t numCols() const {
+    return ncols_;
+  }
+  /**
+   * @return The number of rows in each block
+   */
+  std::int32_t blockRowSize() const {
+    return brow_;
+  }
+  /**
+   * @return The number of columns in each block
+   */
+  std::int32_t blockColSize() const {
+    return bcol_;
+  }
+  /**
+   * @return The number of blocks along rows
+   */
+  std::int32_t blockRows() const {
+    return nbrow_;
+  }
+  /**
+   * @return The number of blocks along columns
+   */
+  std::int32_t blockCols() const {
+    return nbcol_;
+  }
+  /**
+   * @return The number of the rows in the currently packed block of a matrix.
+   *         For pre-packed (i.e., fully-packed), it's equal to the total number
+   *         of rows.
+   */
+  std::int32_t numPackedRows() const {
+    return packedBlock_.row_size;
+  }
+  /**
+   * @return The number of columns in the currently packed block of a matrix.
+   *         For pre-packed (i.e., fully-packed), it's equal to the number of
+   *         columns.
+   */
+  std::int32_t numPackedCols() const {
+    return packedBlock_.col_size;
+  }
+  /**
+   * @return The first row of the block we're working on.
+   */
+  std::int32_t packedRowStart() const {
+    return packedBlock_.row_start;
+  }
+  /**
+   * @return The first column of the block we're working on.
+   */
+  std::int32_t packedColStart() const {
+    return packedBlock_.col_start;
+  }
+  /**
+   * @return The beginning of (rowBlockNum, colBlockNum)th block, i.e. buf_
+   *         advanced by blockRowSize() * blockColSize() *
+   *         (rowBlockNum + blockCols() * colBlockNum) elements.
+   */
+  inpType* getBuf(std::int32_t rowBlockNum = 0, std::int32_t colBlockNum = 0) {
+    return buf_ + blockRowSize() * blockColSize() * rowBlockNum +
+        blockRowSize() * blockColSize() * blockCols() * colBlockNum;
+  }
+  /**
+   * @brief Print the packed block.
+   */
+  void printPackedMatrix(const std::string& name) {
+    static_cast<PT*>(this)->printPackedMatrix(name);
+  }
+  /**
+   * @return The number of rows in the last row block.
+   */
+  std::int32_t lastBrow() const {
+    return last_brow_;
+  }
+  /**
+   * @return The number of columns in the last column block.
+   */
+  std::int32_t lastBcol() const {
+    return last_bcol_;
+  }
+  /**
+   * @return The stored number of groups (G_).
+   */
+  int numGroups() const {
+    return G_;
+  }
+  /**
+   * @return True if the last column block has fewer columns than the block
+   *         size.
+   */
+  bool isThereColRemainder() const {
+    return last_bcol_ != blockColSize();
+  }
+  /**
+   * Frees buf_ with fbgemmAlignedFree, but only when bufAllocatedHere_ is set.
+   */
+  virtual ~PackMatrix() {
+    if (bufAllocatedHere_) {
+      fbgemmAlignedFree(buf_);
+    }
+  }
+ protected:
+  /**
+   * Set which block we're packing, and recompute the derived block counts
+   * (rounded up) and the sizes of the last row/column block.
+   */
+  void packedBlock(const block_type_t& block) {
+    packedBlock_ = block;
+    // Ceiling division: a partial trailing block still counts as a block.
+    nbrow_ = (numPackedRows() + blockRowSize() - 1) / blockRowSize();
+    nbcol_ = (numPackedCols() + blockColSize() - 1) / blockColSize();
+    // The last block is full-sized when the extent divides evenly, otherwise
+    // it holds the remainder.
+    last_brow_ = ((numPackedRows() % blockRowSize()) == 0)
+        ? blockRowSize()
+        : (numPackedRows() % blockRowSize());
+    last_bcol_ = ((numPackedCols() % blockColSize()) == 0)
+        ? blockColSize()
+        : (numPackedCols() % blockColSize());
+  }
+  inpType* buf_; ///< base of the packed buffer (see getBuf())
+  std::int32_t brow_; ///< the number of rows in each block
+  std::int32_t bcol_; ///< the number of columns in each block
+  std::int32_t nbrow_; ///< the number of blocks along rows
+  std::int32_t nbcol_; ///< the number of blocks along columns
+  bool bufAllocatedHere_{false}; ///< if true, the destructor frees buf_
+  const BlockingFactors*
+      blocking_params; ///< MCB, KCB, NCB, MR, NR, NR_MIN, ROW_INTERLEAVE;
+ private:
+  std::int32_t nrows_, ncols_; ///< returned by numRows() / numCols()
+  int G_; ///< returned by numGroups()
+  block_type_t packedBlock_; ///< The block in the source matrix just packed
+  std::int32_t last_brow_, last_bcol_; ///< sizes of the last row/col block
+};
+/**
+ * @brief Matrix packed for the first input matrix in GEMM (usually
+ *        activation).  The source matrix is already quantized. Default
+ *        accumulation type is int32.
+ *
+ * @tparam T element type of the source and packed matrices
+ * @tparam accT accumulation type
+ */
+template <typename T, typename accT = std::int32_t>
+class FBGEMM_API PackAMatrix final
+    : public PackMatrix<PackAMatrix<T, accT>, T, accT> {
+ public:
+  using This = PackAMatrix<T, accT>;
+  using BaseType = PackMatrix<This, T, accT>;
+  using inpType = T;
+  using accType = accT;
+  PackAMatrix() = delete; // no default constructor
+  /**
+   * NOTE(review): the constructor is defined elsewhere; the parameter
+   * descriptions below are inferred from names and from the PackMatrix
+   * constructor documentation - confirm against the implementation.
+   *
+   * @param trans presumably whether smat is transposed
+   * @param nRow presumably the number of rows of the matrix
+   * @param nCol presumably the number of columns of the matrix
+   * @param smat source matrix
+   * @param ld presumably the leading dimension of smat
+   * @param pmat buffer for the packed matrix, nullptr by default
+   * @param groups number of groups, 1 by default
+   * @param params blocking factors, nullptr by default
+   */
+  PackAMatrix(
+      matrix_op_t trans,
+      std::int32_t nRow,
+      std::int32_t nCol,
+      const inpType* smat,
+      std::int32_t ld,
+      inpType* pmat = nullptr,
+      int groups = 1,
+      const BlockingFactors* params = nullptr);
+  /**
+   * Activation matrices are not constant so cannot amortize the cost of
+   * pre-packing.
+   *
+   * @return always false.
+   */
+  bool isPrePacked() const {
+    return false;
+  }
+  /**
+   * @return True if this is used as A matrix (always true for this class).
+   */
+  static constexpr bool isA() {
+    return true;
+  }
+  /**
+   * @return A pointer to the row offset buffer. There is no row offset buffer
+   *         calculations with this packing class, hence, it returns nullptr.
+   */
+  std::int32_t* getRowOffsetBuffer() const {
+    return nullptr;
+  }
+  /**
+   * @return Offset of the element in the packed matrix that was at (i, j) in
+   *         the source matrix.
+   */
+  std::int32_t addr(std::int32_t i, std::int32_t j) const;
+  /**
+   * @brief Packs a block of source matrix into pmat buffer.
+   */
+  void pack(const block_type_t& block);
+  /**
+   * @brief Print the packed block.
+   */
+  void printPackedMatrix(const std::string& name);
+ private:
+  // NOTE(review): presumably these hold the corresponding constructor
+  // arguments (trans, smat, ld); they are set outside this header.
+  matrix_op_t trans_;
+  const T* smat_;
+  std::int32_t ld_;
+  std::int32_t row_interleave_B_;
+};
+/**
+ * @brief Matrix packed for the second input matrix in GEMM (usually weight).
+ *        The source matrix is already quantized. Default accumulation
+ *        type is int32.
+ *
+ * @tparam T element type of the source and packed matrices
+ * @tparam accT accumulation type
+ */
+template <typename T, typename accT = std::int32_t>
+class FBGEMM_API PackBMatrix final
+    : public PackMatrix<PackBMatrix<T, accT>, T, accT> {
+ public:
+  using This = PackBMatrix<T, accT>;
+  using BaseType = PackMatrix<This, T, accT>;
+  using inpType = T;
+  using accType = accT;
+  PackBMatrix() = delete; // no default constructor
+  /**
+   * @param groups if > 1 and trans == NoTranspose, smat is nRow x nCol with
+   *               groups are vertically concatenated: each group is
+   *               (nRow / groups) x nCol .
+   *               if > 1 and trans == Transpose, smat is (nCol * groups) x
+   *               (nRow / groups) with groups are horizontally concatenated:
+   *               each group is nCol x (nRow / groups) . Each group is
+   *               transposed and vertically concatenated to match with the
+   *               NoTranspose case.
+   */
+  PackBMatrix(
+      matrix_op_t trans,
+      std::int32_t nRow,
+      std::int32_t nCol,
+      const inpType* smat,
+      std::int32_t ld,
+      inpType* pmat = nullptr,
+      int groups = 1,
+      const BlockingFactors* params = nullptr);
+  /**
+   * Weight matrices are usually constant so worth pre-packing.
+   *
+   * @return always true.
+   */
+  bool isPrePacked() const {
+    return true;
+  }
+  /**
+   * @return True if to be used as A matrix, False otherwise (always false for
+   *         this class).
+   */
+  static constexpr bool isA() {
+    return false;
+  }
+  /**
+   * @brief When k loop is also tiled/blocked, this function is used to check if
+   *        we have executed computations for the last k block so that we can
+   *        perform post-GEMM operations.
+   *
+   * @return true when block_id equals blockRows() - 1.
+   */
+  bool isThisLastKBlock(int block_id) const {
+    return (BaseType::blockRows() - 1) == block_id;
+  }
+  /**
+   * @return Offset of the element in the packed matrix that was at (i, j) in
+   *         the source matrix.
+   */
+  std::int32_t addr(std::int32_t i, std::int32_t j) const;
+  /**
+   * @brief Packs a block of source matrix into pmat buffer. The blocking
+   *        parameters are needed to compute the buffer size of each group.
+   *        It will use default blocking parameters if params is not provided.
+   */
+  void pack(const block_type_t& block, const BlockingFactors* params = nullptr);
+  /**
+   * @brief Print the packed block.
+   */
+  void printPackedMatrix(
+      const std::string& name,
+      const BlockingFactors* params = nullptr);
+  /**
+   * @return true if meta information like matrix shape is the same.
+   */
+  bool metaEquals(const PackBMatrix<T, accT>& that) const;
+  /**
+   * @return true if matrices are the same.
+   */
+  bool equals(const PackBMatrix<T, accT>& that) const;
+  /**
+   * @brief Unpack pmat buffer to the origin_buf (Used for the serialization to
+   *        recover weight matrix).
+   */
+  void unpack(T* origin_buf, const BlockingFactors* params = nullptr);
+  // Buffer cleanup is handled by the PackMatrix base destructor.
+  ~PackBMatrix() override = default;
+ private:
+  // NOTE(review): presumably these hold the corresponding constructor
+  // arguments (trans, smat, ld); they are set outside this header.
+  matrix_op_t trans_;
+  const T* smat_;
+  std::int32_t ld_;
+  std::int32_t row_interleave_;
+  /**
+   * @brief Internal function performing both pack & unpack
+   *
+   * @param ispack selects the direction: packing when true, unpacking
+   *               otherwise (per the brief above; defined elsewhere).
+   */
+  void pack_unpack_(
+      const block_type_t& block,
+      T* unpack_buf,
+      T* pack_buf,
+      bool ispack,
+      const BlockingFactors* params = nullptr);
+};
+/**
+ * @brief Matrix packed for direct group convolution.
+ *        The source matrix is already quantized. Default accumulation
+ *        type is int32.
+ *
+ * @tparam T element type of the source and packed weights
+ * @tparam accT accumulation type
+ * @tparam SPATIAL_DIM number of spatial dimensions of the convolution
+ */
+template <typename T, typename accT = std::int32_t, int SPATIAL_DIM = 2>
+class FBGEMM_API PackWeightMatrixForGConv {
+ public:
+  using This = PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>;
+  using inpType = T;
+  using accType = accT;
+  PackWeightMatrixForGConv() = delete; // no default constructor
+  PackWeightMatrixForGConv(const PackWeightMatrixForGConv&) = delete; // no copy
+  PackWeightMatrixForGConv& operator=(const PackWeightMatrixForGConv&) =
+      delete; // no copy
+  PackWeightMatrixForGConv(PackWeightMatrixForGConv&&) = delete; // no move
+  PackWeightMatrixForGConv& operator=(PackWeightMatrixForGConv&&) =
+      delete; // no move
+  /**
+   * @param pdata if nullptr, a buffer is allocated and owned by this class.
+   */
+  PackWeightMatrixForGConv(
+      matrix_op_t trans,
+      const conv_param_t<SPATIAL_DIM>& conv_param,
+      const inpType* sdata,
+      inpType* pdata = nullptr);
+  /**
+   * Number of groups we work at a time to fill the full simd width
+   * e.g., IC_PER_G = 4 and OC_PER_G = 4, we work on two groups at a time
+   * to fill the avx2 width of 256 bits.
+   */
+  static int numOfGroupsTogether(const conv_param_t<SPATIAL_DIM>& conv_param);
+  /**
+   * @brief Packs a block of source matrix into pmat buffer.
+   */
+  void pack();
+  /**
+   * @brief Unpacks a pmat buffer into source matrix.
+   */
+  void unpack(T* origin_buf);
+  /**
+   * @brief Return packed data
+   */
+  inpType* getBuf() {
+    return pdata_;
+  }
+  /**
+   * Frees pdata_ with fbgemmAlignedFree, but only when bufAllocatedHere_ is
+   * set.
+   */
+  ~PackWeightMatrixForGConv() {
+    if (bufAllocatedHere_) {
+      fbgemmAlignedFree(pdata_);
+    }
+  }
+ private:
+  matrix_op_t trans_;
+  const conv_param_t<SPATIAL_DIM> conv_param_; // stored by value (a copy)
+  const T* sdata_;
+  T* pdata_; // packed data, returned by getBuf()
+  bool bufAllocatedHere_{false}; // if true, the destructor frees pdata_
+  // Number of groups we work at a time to fill the full simd width
+  int GTogether_;
+  /**
+   * @brief Internal function performing both pack & unpack
+   */
+  void pack_unpack_(const T* src, T* dst, bool ispack);
+  /**
+   * @brief Get the index of the unpacked data
+   */
+  int unpacked_index_(int t, int r, int s, int k, int g, int c, bool tr);
+  /**
+   * @brief Get the index of the packed data
+   */
+  int packed_index_(int t, int r, int s, int k, int g, int c);
+};
+/**
+ * @brief A container class to keep packed weight tensor for convolution.
+ *        The source tensor should already be quantized.
+ *
+ * It holds one shared pointer per convolution implementation (im2col,
+ * depthwise, direct, groupwise, pointwise).
+ * NOTE(review): which of them are populated is decided by the constructor,
+ * defined elsewhere - callers should presumably check for null.
+ *
+ * @tparam SPATIAL_DIM is equal to 2 for 2D convolutions and 3 for 3D
+ *                     convolutions. Default value is 2.
+ * @tparam T is the datatype for source tensor. Default value is int8.
+ * @tparam accT is the datatype to accumulate into. Default value is int32.
+ */
+template <
+    int SPATIAL_DIM = 2,
+    typename T = std::int8_t,
+    typename accT = std::int32_t>
+class FBGEMM_API PackWeightsForConv {
+ public:
+  using This = PackWeightsForConv<SPATIAL_DIM, T, accT>;
+  using inpType = T;
+  using accType = accT;
+  PackWeightsForConv() = delete; // no default constructor
+  PackWeightsForConv(
+      const conv_param_t<SPATIAL_DIM>& conv_param,
+      const inpType* sdata,
+      const BlockingFactors* blocking_params = nullptr);
+  /** @return Packed weights for the im2col based implementation. */
+  std::shared_ptr<PackBMatrix<T, accT>> getPackedWForIm2col() {
+    return W_im2col_packed_;
+  }
+  // The depthwise accessor is compiled out on aarch64 unless FBGEMM_FBCODE
+  // is defined.
+#if defined(FBGEMM_FBCODE) || !defined(__aarch64__)
+  /** @return Packed weights for the depthwise implementation. */
+  std::shared_ptr<PackedDepthWiseConvMatrix> getPackedWForDepthwise() {
+    return W_dw_packed_;
+  }
+#endif // defined(FBGEMM_FBCODE) || !defined(__aarch64__)
+  /** @return Packed weights for the direct convolution implementation. */
+  std::shared_ptr<PackedDirectConvMatrix> getPackedWForDirectconv() {
+    return W_dc_packed_;
+  }
+  /** @return Packed weights for the groupwise implementation. */
+  std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
+  getPackedWForGroupwise() {
+    return W_gconv_packed_;
+  }
+  /** @return Packed weights for the pointwise (direct gemm) implementation. */
+  std::shared_ptr<PackBMatrix<T, accT>> getPackedWForPointwise() {
+    return W_pointwise_packed_;
+  }
+  /** @return Number of input channels (conv_param_.IC). */
+  int inputChannels() {
+    return conv_param_.IC;
+  }
+  /** @return Number of output channels (conv_param_.OC). */
+  int outputChannels() {
+    return conv_param_.OC;
+  }
+  /** @return Kernel dimensions (conv_param_.K). */
+  std::array<int, SPATIAL_DIM> kernelDims() {
+    return conv_param_.K;
+  }
+  /** @return Number of groups (conv_param_.G). */
+  int groups() {
+    return conv_param_.G;
+  }
+  /**
+   * @brief Returns true if the packed weights would work for the given
+   * convolution parameters, and false otherwise
+   */
+  bool isPackingCompliant(const conv_param_t<SPATIAL_DIM>& conv_p);
+  /**
+   * @brief Returns a string of mismatching parameters
+   */
+  std::string mismatchingParams(const conv_param_t<SPATIAL_DIM>& conv_p);
+  /**
+   * @brief Unpack packed matrix into origin_buf (Used for the serialization to
+   * recover weight matrix).
+   */
+  void unpack(T* origin_buf);
+ private:
+  const conv_param_t<SPATIAL_DIM> conv_param_; // stored by value (a copy)
+  // Packed weights if we use im2col based convolution implementation
+  std::shared_ptr<PackBMatrix<T, accT>> W_im2col_packed_;
+#if defined(FBGEMM_FBCODE) || !defined(__aarch64__)
+  // Packed weights if we use depthwise convolution implementation
+  std::shared_ptr<PackedDepthWiseConvMatrix> W_dw_packed_;
+#endif // defined(FBGEMM_FBCODE) || !defined(__aarch64__)
+  // Packed weights if we use direct convolution implementation
+  std::shared_ptr<PackedDirectConvMatrix> W_dc_packed_;
+  // Packed weights if we use groupwise (small channels per group) convolution
+  // implementation
+  std::shared_ptr<PackWeightMatrixForGConv<T, accT, SPATIAL_DIM>>
+      W_gconv_packed_;
+  // Packed weights if we use direct gemm for pointwise convolution
+  std::shared_ptr<PackBMatrix<T, accT>> W_pointwise_packed_;
+};
+/**
+ * @brief Matrix packed for the first input matrix in GEMM (usually activation),
+ *        and row offsets used for requantization is computed during packing.
+ *        Im2col is fused with packing here. The source matrix is already
+ *        quantized.
+ *
+ * @tparam T element type of the source and packed matrices
+ * @tparam accT accumulation type
+ * @tparam SPATIAL_DIM number of spatial dimensions of the convolution
+ */
+template <typename T, typename accT = std::int32_t, int SPATIAL_DIM = 2>
+class FBGEMM_API PackAWithIm2Col
+    : public PackMatrix<PackAWithIm2Col<T, accT, SPATIAL_DIM>, T, accT> {
+ public:
+  using This = PackAWithIm2Col<T, accT, SPATIAL_DIM>;
+  using BaseType = PackMatrix<This, T, accT>;
+  using inpType = T;
+  using accType = accT;
+  PackAWithIm2Col() = delete; // no default constructor
+  /**
+   * @param conv_param convolution parameters
+   * @param sdata source (activation) data
+   * @param pmat buffer for the packed matrix, nullptr by default
+   * @param a_zero_pt the quantized value that maps to 0.0f floating-point
+   *                  number.
+   * @param row_offset If nullptr, this constructor internally allocates a
+   *                   buffer and owns it. Otherwise, this class doesn't own
+   *                   the buffer. The buffer will be populated when pack
+   *                   function is called.
+   * @param b_symmetric if true we skip row offset computation
+   * @param params blocking factors, nullptr by default
+   */
+  PackAWithIm2Col(
+      const conv_param_t<SPATIAL_DIM>& conv_param,
+      const T* sdata,
+      inpType* pmat = nullptr,
+      std::int32_t a_zero_pt = 0,
+      std::int32_t* row_offset = nullptr,
+      bool b_symmetric = false,
+      const BlockingFactors* params = nullptr);
+  // Neither copyable nor movable.
+  PackAWithIm2Col(const PackAWithIm2Col&) = delete;
+  PackAWithIm2Col(PackAWithIm2Col&&) = delete;
+  PackAWithIm2Col& operator=(const PackAWithIm2Col&) = delete;
+  PackAWithIm2Col& operator=(PackAWithIm2Col&&) = delete;
+  /**
+   * Activation matrices are not constant so cannot amortize the cost of
+   * pre-packing.
+   *
+   * @return always false.
+   */
+  bool isPrePacked() const {
+    return false;
+  }
+  /**
+   * @return True if this is used as A matrix (always true for this class).
+   */
+  static constexpr bool isA() {
+    return true;
+  }
+  /**
+   * @brief Packs a block of source matrix into pmat buffer.
+   */
+  void pack(const block_type_t& block);
+  /**
+   * @return A pointer to the row offset buffer.
+   */
+  std::int32_t* getRowOffsetBuffer() const {
+    return row_offset_;
+  }
+  /**
+   * @brief Print the packed block.
+   */
+  void printPackedMatrix(const std::string& name);
+  /**
+   * @return Size of row offset buffer in number of elements
+   */
+  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);
+  /**
+   * Frees row_offset_ with fbgemmAlignedFree, but only when
+   * rowOffsetAllocatedHere is set.
+   */
+  ~PackAWithIm2Col() override {
+    if (rowOffsetAllocatedHere) {
+      fbgemmAlignedFree(row_offset_);
+    }
+  }
+ private:
+  const conv_param_t<SPATIAL_DIM> conv_p_; // stored by value (a copy)
+  const T* sdata_;
+  std::int32_t a_zero_pt_;
+  std::int32_t* row_offset_{nullptr}; // returned by getRowOffsetBuffer()
+  bool rowOffsetAllocatedHere{false}; // if true, destructor frees row_offset_
+  std::int32_t row_interleave_B_;
+};
+/**
+ * @brief Matrix packed for the first input matrix in GEMM (usually activation),
+ *        and row offsets used for requantization are computed during packing.
+ *        The source matrix is already quantized.
+ *
+ * Copying and moving are disabled. The destructor releases the row offset
+ * buffer only when rowOffsetAllocatedHere is set.
+ *
+ * @tparam T element type of the source and packed matrices (exposed as
+ *           inpType)
+ * @tparam accT accumulation type (exposed as accType)
+ */
+template <typename T, typename accT = std::int32_t>
+class FBGEMM_API PackAWithRowOffset final
+    : public PackMatrix<PackAWithRowOffset<T, accT>, T, accT> {
+ public:
+  using This = PackAWithRowOffset<T, accT>;
+  using BaseType = PackMatrix<This, T, accT>;
+  using inpType = T;
+  using accType = accT;
+  PackAWithRowOffset() = delete; // no default constructor
+  /**
+   * @param row_offset If nullptr, this constructor internally allocates a
+   *                   buffer and owns it. Otherwise, this class doesn't own
+   *                   the buffer. The buffer will be populated when pack
+   *                   function is called.
+   * @param params optional blocking factors; defaults to nullptr
+   */
+  PackAWithRowOffset(
+      matrix_op_t trans,
+      std::uint32_t nRow,
+      std::uint32_t nCol,
+      const T* smat,
+      std::uint32_t ld,
+      inpType* pmat = nullptr,
+      int groups = 1,
+      std::int32_t* row_offset = nullptr,
+      const BlockingFactors* params = nullptr);
+  PackAWithRowOffset(const PackAWithRowOffset&) = delete;
+  PackAWithRowOffset(PackAWithRowOffset&&) = delete;
+  PackAWithRowOffset& operator=(const PackAWithRowOffset&) = delete;
+  PackAWithRowOffset& operator=(PackAWithRowOffset&&) = delete;
+  /**
+   * Activation matrices are not constant so cannot amortize the cost of
+   * pre-packing.
+   *
+   * @return Always false.
+   */
+  bool isPrePacked() const {
+    return false;
+  }
+  /**
+   * @return True if this is used as A matrix.
+   */
+  static constexpr bool isA() {
+    return true;
+  }
+  /**
+   * @return Offset of the element in the packed matrix that was at (i, j) in
+   *         the source matrix
+   */
+  std::int32_t addr(std::int32_t i, std::int32_t j) const;
+  /**
+   * @brief Packs a block of source matrix into pmat buffer.
+   */
+  void pack(const block_type_t& block);
+  /**
+   * @return A pointer to the row offset buffer.
+   */
+  std::int32_t* getRowOffsetBuffer() const {
+    return row_offset_;
+  }
+  /**
+   * @brief Print the packed block.
+   */
+  void printPackedMatrix(const std::string& name);
+  /**
+   * @return size of row offset buffer in number of elements
+   */
+  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);
+  ~PackAWithRowOffset() override {
+    if (rowOffsetAllocatedHere) {
+      fbgemmAlignedFree(row_offset_);
+    }
+  }
+ private:
+  matrix_op_t trans_;
+  const T* smat_;
+  std::uint32_t ld_;
+  std::int32_t* row_offset_{nullptr};
+  bool rowOffsetAllocatedHere{false}; // when true, the destructor frees row_offset_
+  std::int32_t row_interleave_B_;
+};
+/**
+ * @brief Matrix packed for the first input matrix in GEMM (usually activation),
+ *        and row offsets used for requantization are computed during packing.
+ *        The source matrix is in fp32 and quantized during packing.
+ *
+ * Copying and moving are disabled. The destructor releases the row offset
+ * buffer only when rowOffsetAllocatedHere is set.
+ *
+ * @tparam T element type of the packed matrix (exposed as inpType); the
+ *           source matrix is always const float*
+ * @tparam accT accumulation type (exposed as accType)
+ */
+template <typename T, typename accT = std::int32_t>
+class FBGEMM_API PackAWithQuantRowOffset final
+    : public PackMatrix<PackAWithQuantRowOffset<T, accT>, T, accT> {
+ public:
+  using This = PackAWithQuantRowOffset<T, accT>;
+  using BaseType = PackMatrix<This, T, accT>;
+  using inpType = T;
+  using accType = accT;
+  PackAWithQuantRowOffset() = delete; // no default constructor
+  /**
+   * @param scale defaults to 1.0f; presumably the quantization scale applied
+   *              to smat while packing -- TODO confirm
+   * @param zero_pt defaults to 0; presumably the quantization zero point
+   *                applied to smat while packing -- TODO confirm
+   * @param row_offset If nullptr, this constructor internally allocates a
+   *                   buffer and owns it. Otherwise, this class doesn't own
+   *                   the buffer. The buffer will be populated when pack
+   *                   function is called.
+   * @param params optional blocking factors; defaults to nullptr
+   */
+  PackAWithQuantRowOffset(
+      matrix_op_t trans,
+      std::int32_t nRow,
+      std::int32_t nCol,
+      const float* smat,
+      std::int32_t ld,
+      inpType* pmat = nullptr,
+      float scale = 1.0f,
+      std::int32_t zero_pt = 0,
+      int groups = 1,
+      std::int32_t* row_offset = nullptr,
+      const BlockingFactors* params = nullptr);
+  PackAWithQuantRowOffset(const PackAWithQuantRowOffset&) = delete;
+  PackAWithQuantRowOffset(PackAWithQuantRowOffset&&) = delete;
+  PackAWithQuantRowOffset& operator=(const PackAWithQuantRowOffset&) = delete;
+  PackAWithQuantRowOffset& operator=(PackAWithQuantRowOffset&&) = delete;
+  /**
+   * Activation matrices are not constant so cannot amortize the cost of
+   * pre-packing.
+   *
+   * @return Always false.
+   */
+  bool isPrePacked() const {
+    return false;
+  }
+  /**
+   * @return True if this is used as A matrix.
+   */
+  static constexpr bool isA() {
+    return true;
+  }
+  /**
+   * @return offset of the element in the packed matrix that was at (i, j) in
+   *         the source matrix
+   */
+  std::int32_t addr(std::int32_t i, std::int32_t j) const;
+  /**
+   * @brief Packs a block of source matrix into pmat buffer.
+   */
+  void pack(const block_type_t& block);
+  /**
+   * @return A pointer to the row offset buffer.
+   */
+  std::int32_t* getRowOffsetBuffer() const {
+    return row_offset_;
+  }
+  /**
+   * @brief Print the packed block.
+   */
+  void printPackedMatrix(const std::string& name);
+  /**
+   * @return Size of row offset buffer in number of elements
+   */
+  static int rowOffsetBufferSize(const BlockingFactors* params = nullptr);
+  ~PackAWithQuantRowOffset() override {
+    if (rowOffsetAllocatedHere) {
+      fbgemmAlignedFree(row_offset_);
+    }
+  }
+ private:
+  matrix_op_t trans_;
+  const float* smat_;
+  std::int32_t ld_;
+  float scale_;
+  std::int32_t zero_pt_;
+  std::int32_t* row_offset_{nullptr};
+  bool rowOffsetAllocatedHere{false}; // when true, the destructor frees row_offset_
+  std::int32_t row_interleave_B_;
+};
+/*
+ *
+ * Post Processing of outputs
+ *
+ */
+/**
+ * @brief Does nothing. NoOp. Used as the last operation in the output
+ *        processing pipeline.
+ *
+ * f() ignores every argument and always returns 0.
+ */
+template <typename outT = std::uint8_t, typename inT = std::uint8_t>
+class FBGEMM_API DoNothing {
+ public:
+  using outType = outT;
+  using inpType = inT;
+  DoNothing() = default;
+  template <inst_set_t instSet>
+  int f(
+      outType* /* unused */,
+      inpType* /* unused */,
+      const block_type_t& /* unused */,
+      int /* unused */,
+      int /* unused */) const {
+    return 0;
+  }
+};
+/**
+ * @brief Copy data pointed by inp ptr to out ptr when
+ *        inp ptr and out ptr are not the same.
+ *        inp buffer: row and column start points: (0, 0)
+ *        output buffer: row and column start points:
+ *        (block.row_start, block.col_start)
+ *
+ * This is the output processing stage that should be passed when there is no
+ * requantization and output is required in the same format as internal buffer
+ * used for accumulation.
+ *
+ * nextop is held by reference, so it must outlive this object.
+ */
+template <
+    typename outT = std::int32_t,
+    typename inT = std::int32_t,
+    typename nextOPType = DoNothing<outT, outT>>
+class FBGEMM_API memCopy {
+ public:
+  using outType = outT;
+  using inpType = inT;
+  explicit memCopy(nextOPType& nextop) : nextop_(nextop) {}
+  template <inst_set_t instSet>
+  inline int f(
+      outType* out,
+      inpType* inp,
+      const block_type_t& block,
+      int ld_out,
+      int ld_in) const;
+ private:
+  nextOPType& nextop_;
+};
+/**
+ * @brief Perform scaling on accumulated data.
+ *
+ * The scaling factor is stored by value. Although nextOPType is a template
+ * parameter, the constructor takes no next-op and none is stored.
+ */
+template <
+    typename outT = std::int32_t,
+    typename inT = std::int32_t,
+    typename nextOPType = DoNothing<outT, outT>>
+class ScaleOP {
+ public:
+  using outType = outT;
+  using inpType = inT;
+  explicit ScaleOP(inpType scalingFactor) : scalingFactor_(scalingFactor) {}
+  template <inst_set_t instSet>
+  inline int f(
+      outType* out,
+      inpType* inp,
+      const block_type_t& block,
+      int ld_out,
+      int ld_in) const;
+ private:
+  inpType scalingFactor_;
+};
+/**
+ * @brief Perform Relu on accumulated data.
+ *
+ * The zero point is stored by value. Although nextOPType is a template
+ * parameter, the constructor takes no next-op and none is stored.
+ */
+template <
+    typename outT = std::int32_t,
+    typename inT = std::int32_t,
+    typename nextOPType = DoNothing<outT, outT>>
+class ReluOutput {
+ public:
+  using outType = outT;
+  using inpType = inT;
+  explicit ReluOutput(inpType zero_pt) : zero_pt_(zero_pt) {}
+  template <inst_set_t instSet>
+  inline int f(
+      outType* out,
+      inpType* inp,
+      const block_type_t& block,
+      int ld_out,
+      int ld_in) const;
+ private:
+  inpType zero_pt_;
+};
+/**
+ * @brief Perform Dense-Matrix * Sparse-Matrix as a part of the output
+ * processing pipeline.
+ *
+ * SPMDM (SParse Matrix times Dense Matrix) inplace on the 32-bit input buffer
+ * (inp). After modifying the input buffer, pass it to the next op.
+ * When groups > 1, each group is numRows() x (numCols()/groups) matrix.
+ *
+ * nextop and B_csc are held by reference, so both must outlive this object.
+ * The pointer A is stored as given.
+ */
+template <
+    typename outT = std::int32_t,
+    typename inT = std::int32_t,
+    typename nextOPType = DoNothing<inT, inT>>
+class FBGEMM_API DoSpmdmOnInpBuffer {
+ public:
+  using outType = outT;
+  using inpType = inT;
+  DoSpmdmOnInpBuffer(
+      nextOPType& nextop,
+      const std::uint8_t* A,
+      int lda,
+      const CompressedSparseColumn& B_csc,
+      int groups = 1)
+      : nextop_(nextop), A_(A), lda_(lda), B_csc_(B_csc), groups_(groups) {}
+  template <inst_set_t instSet>
+  inline int f(
+      outT* out,
+      inT* inp,
+      const block_type_t& block,
+      int ld_out,
+      int ld_in) const;
+ private:
+  nextOPType& nextop_;
+  const std::uint8_t* A_;
+  const int lda_;
+  const CompressedSparseColumn& B_csc_;
+  const int groups_;
+};
+/**
+ * @brief Perform Dense-Matrix * Sparse-Matrix as a part of the output
+ * processing pipeline.
+ *
+ * SPMDM (SParse Matrix times Dense Matrix) inplace on the 32-bit input buffer
+ * (inp). After modifying the input buffer, pass it to the next op.
+ * When groups > 1, each group is numRows() x (numCols()/groups) matrix.
+ *
+ * NOTE(review): this description is word-for-word the one used for
+ * DoSpmdmOnInpBuffer, yet this class takes convolution parameters and an
+ * activation zero point and has no groups argument -- confirm that the text
+ * above (in particular the "groups > 1" sentence) applies here.
+ *
+ * nextop and B_csc are held by reference, so both must outlive this object;
+ * conv_p is copied into the object.
+ */
+template <
+    typename outT = std::int32_t,
+    typename inT = std::int32_t,
+    typename nextOPType = DoNothing<inT, inT>>
+class FBGEMM_API DoSConvOnInpBuffer {
+ public:
+  using outType = outT;
+  using inpType = inT;
+  DoSConvOnInpBuffer(
+      nextOPType& nextop,
+      const std::uint8_t* A,
+      const conv_param_t<>& conv_p,
+      std::int32_t A_zero_point,
+      const CompressedSparseColumn& B_csc)
+      : nextop_(nextop),
+        A_(A),
+        conv_p_(conv_p),
+        A_zero_point_(A_zero_point),
+        B_csc_(B_csc) {}
+  template <inst_set_t instSet>
+  inline int f(
+      outT* out,
+      inT* inp,
+      const block_type_t& block,
+      int ld_out,
+      int ld_in) const;
+ private:
+  nextOPType& nextop_;
+  const std::uint8_t* A_;
+  const conv_param_t<> conv_p_; // a copy of the constructor argument
+  const std::int32_t A_zero_point_;
+  const CompressedSparseColumn& B_csc_;
+};
+/**
+ * @brief Requantize values in inp buffer and write to out buffer.
+ *        pass the out buffer to next op for further processing.
+ *
+ * The constructor stores every pointer argument as given (no data is copied)
+ * and holds nextop by reference, so all of them must stay valid while this
+ * object is in use.
+ */
+template <
+    bool FUSE_RELU,
+    QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
+    typename BIAS_TYPE = std::int32_t,
+    typename outT = std::uint8_t,
+    typename inT = std::int32_t,
+    typename nextOPType = DoNothing<outT, outT>>
+class FBGEMM_API ReQuantizeOutput {
+ public:
+  static constexpr int RELU_FUSED = FUSE_RELU; // int initialized from the bool template argument
+  static constexpr QuantizationGranularity QGRANType = Q_GRAN;
+  using BIAS_T = BIAS_TYPE;
+  using outType = outT;
+  using inpType = inT;
+  /**
+   * @param C_multiplier The length of this array is
+   *                     1 when Q_GRAN == QuantizationGranularity::TENSOR,
+   *                     groups when Q_GRAN == QuantizationGranularity::GROUP,
+   *                     nCol if Q_GRAN == QuantizationGranularity::OUT_CHANNEL
+   * @param Bq_zero_point The length of this array should be the same as
+   *                      C_multiplier.
+   * @param row_offsets Typically, this should've been computed by a
+   *                    PackAMatrix and should be obtained by
+   *                    PackMatrix::getRowOffsetBuffer().
+   *                    If Bq_zero_point == 0 (symmetric quantization of B
+   *                    matrix), we can pass nullptr.
+   * @param col_offsets This should be pre-computed for example using
+   *                    col_offsets_with_zero_pt_s8acc32_ref.
+   *                    The length should be nCol.
+   *                    See PackedRequantizeTest.cc for an example.
+   *                    TODO: if Aq_zero_point == 0, allow passing nullptr.
+   * @param bias can be nullptr otherwise the length should be nCol
+   * @param act_times_w_scale activation_scale * weight_scale. This is only
+   *                          used if bias is unquantized (i.e., float).
+   */
+  ReQuantizeOutput(
+      nextOPType& nextop,
+      const float* C_multiplier,
+      std::int32_t C_zero_point,
+      std::int32_t Aq_zero_point,
+      const std::int32_t* Bq_zero_point,
+      const std::int32_t* row_offsets,
+      const std::int32_t* col_offsets,
+      const BIAS_T* bias,
+      std::uint32_t nCol,
+      int groups = 1,
+      const float* act_times_w_scale = nullptr)
+      : nextop_(nextop),
+        C_multiplier_(C_multiplier),
+        C_zero_point_(C_zero_point),
+        Aq_zero_point_(Aq_zero_point),
+        Bq_zero_point_(Bq_zero_point),
+        q_row_offsets_(row_offsets),
+        q_col_offsets_(col_offsets),
+        bias_(bias),
+        ncols_(nCol),
+        groups_(groups),
+        act_times_w_scale_(act_times_w_scale) {}
+  template <inst_set_t instSet>
+  inline int f(
+      outT* out,
+      const inT* inp,
+      const block_type_t& block,
+      int ld_out,
+      int ld_in) const;
+  const float* getCMultiplier() const {
+    return C_multiplier_;
+  }
+  std::int32_t getAZeroPoint() const {
+    return Aq_zero_point_;
+  }
+  std::int32_t getCZeroPoint() const {
+    return C_zero_point_;
+  }
+  const std::int32_t* getBZeroPoint() const {
+    return Bq_zero_point_;
+  }
+  const std::int32_t* getRowOffsets() const {
+    return q_row_offsets_;
+  }
+  const std::int32_t* getColOffsets() const {
+    return q_col_offsets_;
+  }
+  const BIAS_T* getBias() const {
+    return bias_;
+  }
+  std::uint32_t getNCols() const {
+    return ncols_;
+  }
+  const float* getActWScale() const {
+    return act_times_w_scale_;
+  }
+  void setRowOffsets(const std::int32_t* row_offsets) { // only re-points; the buffer is not copied
+    q_row_offsets_ = row_offsets;
+  }
+ private:
+  nextOPType& nextop_;
+  const float* C_multiplier_;
+  std::int32_t C_zero_point_;
+  std::int32_t Aq_zero_point_;
+  const std::int32_t* Bq_zero_point_;
+  const std::int32_t* q_row_offsets_;
+  const std::int32_t* q_col_offsets_;
+  const BIAS_T* bias_;
+  std::uint32_t ncols_;
+  int groups_;
+  const float* act_times_w_scale_;
+};
+/**
+ * @brief Requantize to convert accumulated data to be used as float, i.e., the
+ *        output would be used as float.
+ *
+ * The constructor stores every pointer argument as given (no data is copied)
+ * and holds nextop by reference, so all of them must stay valid while this
+ * object is in use.
+ */
+template <
+    bool FUSE_RELU,
+    QuantizationGranularity Q_GRAN = QuantizationGranularity::TENSOR,
+    typename outT = float,
+    typename inT = std::int32_t,
+    typename nextOPType = DoNothing<outT, outT>>
+class FBGEMM_API ReQuantizeForFloat {
+ public:
+  using outType = outT;
+  using inpType = inT;
+  /**
+   * @param Bq_scale The length of this array is
+   *                 1 when Q_GRAN == QuantizationGranularity::TENSOR,
+   *                 groups when Q_GRAN == QuantizationGranularity::GROUP,
+   *                 nCol if Q_GRAN == QuantizationGranularity::OUT_CHANNEL
+   * @param Bq_zero_point The length of this array should be the same as
+   *                      Bq_scale.
+   * @param row_offsets Typically, this should've been computed by a
+   *                    PackAMatrix and should be obtained by
+   *                    PackMatrix::getRowOffsetBuffer().
+   *                    If Bq_zero_point == 0 (symmetric quantization of B
+   *                    matrix), we can pass nullptr.
+   * @param col_offsets This should be pre-computed for example using
+   *                    col_offsets_with_zero_pt_s8acc32_ref.
+   *                    The length should be nCol.
+   *                    See PackedRequantizeTest.cc for an example.
+   *                    TODO: if Aq_zero_point == 0, allow passing nullptr.
+   * @param bias can be nullptr otherwise the length should be nCol
+   */
+  ReQuantizeForFloat(
+      nextOPType& nextop,
+      float Aq_scale,
+      const float* Bq_scale,
+      std::int32_t Aq_zero_point,
+      const std::int32_t* Bq_zero_point,
+      const std::int32_t* row_offsets,
+      const std::int32_t* col_offsets,
+      const float* bias,
+      std::uint32_t nCol,
+      int groups = 1)
+      : nextop_(nextop),
+        Aq_scale_(Aq_scale),
+        Bq_scale_(Bq_scale),
+        Aq_zero_point_(Aq_zero_point),
+        Bq_zero_point_(Bq_zero_point),
+        q_row_offsets_(row_offsets),
+        q_col_offsets_(col_offsets),
+        bias_(bias),
+        ncols_(nCol),
+        groups_(groups) {}
+  template <inst_set_t instSet>
+  inline int f(
+      outT* out,
+      inT* inp,
+      const block_type_t& block,
+      int ld_out,
+      int ld_in) const;
+ private:
+  nextOPType& nextop_;
+  float Aq_scale_;
+  const float* Bq_scale_;
+  std::int32_t Aq_zero_point_;
+  const std::int32_t* Bq_zero_point_;
+  const std::int32_t* q_row_offsets_;
+  const std::int32_t* q_col_offsets_;
+  const float* bias_;
+  std::uint32_t ncols_;
+  int groups_;
+};
+// type specialized implementation in an include file
+#include "./OutputProcessing-inl.h" // @manual
+/*
+ *
+ * ####### GEMM related functions #######
+ *
+ */
+/**
+ * Matrix B must be prepacked. For matrix A, packA.pack function is called to
+ * pack it.
+ *
+ * @tparam packingAMatrix processing of A matrix while packing,
+ *                        e.g., PackAWithQuantRowOffset
+ *
+ * @tparam packingBMatrix processing of B matrix while packing,
+ *                        e.g.,  pre-multiply by alpha
+ * @tparam cT data type of C matrix
+ * @tparam processOutputType further processing of outputs, e.g., Relu
+ */
+template <
+    typename packingAMatrix,
+    typename packingBMatrix,
+    typename cT,
+    typename processOutputType>
+FBGEMM_API void fbgemmPacked(
+    PackMatrix<
+        packingAMatrix,
+        typename packingAMatrix::inpType,
+        typename packingAMatrix::accType>& packA,
+    PackMatrix<
+        packingBMatrix,
+        typename packingBMatrix::inpType,
+        typename packingBMatrix::accType>& packB,
+    cT* C,
+    std::int32_t* C_buffer,
+    std::uint32_t ldc,
+    const processOutputType& outProcess,
+    int thread_id,
+    int num_threads,
+    const BlockingFactors* blocking_params = nullptr);
+/**
+ * @brief Perform small-channels-per-group groupwise convolution
+ *        Note: Currently threading is not supported. This function does
+ *              nothing for thread_ids > 0, i.e., returns early.
+ *
+ * @param rowOffsetBuf nullptr if B uses symmetric quantization
+ */
+template <
+    typename packed_W,
+    typename outType,
+    bool FUSE_RELU,
+    QuantizationGranularity Q_GRAN,
+    int SPATIAL_DIM = 2,
+    typename BIAS_TYPE = std::int32_t>
+FBGEMM_API void fbgemmGroupwiseConv(
+    const conv_param_t<SPATIAL_DIM>& conv_param,
+    const std::uint8_t* activations,
+    std::int32_t a_zero_point,
+    std::int32_t* rowOffsetBuf,
+    packed_W& packed_weights,
+    outType* out,
+    std::int32_t* outBuffer,
+    const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess,
+    int thread_id,
+    int num_threads);
+template <
+    int SPATIAL_DIM,
+    QuantizationGranularity Q_GRAN,
+    bool FUSE_RELU,
+    typename BIAS_TYPE = std::int32_t>
+FBGEMM_API void fbgemmDirectConv(
+    const conv_param_t<SPATIAL_DIM>& conv_p,
+    const uint8_t* Aint8,
+    PackedDirectConvMatrix& Bint8_tr,
+    uint8_t* C,
+    int32_t* C_buffer,
+    const ReQuantizeOutput<FUSE_RELU, Q_GRAN, BIAS_TYPE>& outProcess,
+    const BIAS_TYPE* bias,
+    int thread_id,
+    int num_threads);
+/**
+ * @return Size of row offset buffer in number of elements needed for
+ * fbgemmGroupwiseConv
+ */
+template <int SPATIAL_DIM = 2>
+FBGEMM_API int rowOffsetBufferSizeGConv(
+    const conv_param_t<SPATIAL_DIM>& conv_param);
+/**
+ * @brief Is this depthwise convolution optimized?
+ */
+template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
+bool takeDepthWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);
+/**
+ * @brief Is this groupwise convolution supported?
+ */
+template <int SPATIAL_DIM>
+FBGEMM_API bool fbgemmOptimizedGConv(const conv_param_t<SPATIAL_DIM>& conv_p);
+/**
+ * @brief Is this convolution a direct matrix-matrix multiplication, i.e., 1x1
+ * (aka pointwise) with right paddings etc.?
+ */
+template <int SPATIAL_DIM>
+FBGEMM_API bool takePointWiseFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);
+/**
+ * @brief Are we running on a fbgemm supported cpu?
+ */
+FBGEMM_API bool fbgemmSupportedCPU();
+/**
+ * @brief Performs convolution using fastest path available.
+ *
+ * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions.
+ */
+template <
+    typename processOutputType,
+    int SPATIAL_DIM = 2,
+    typename ACC_T = std::int32_t>
+FBGEMM_API int fbgemmConv(
+    const conv_param_t<SPATIAL_DIM>& conv_p,
+    const std::uint8_t* activations,
+    PackWeightsForConv<SPATIAL_DIM, std::int8_t, ACC_T>& packed_weights,
+    typename processOutputType::outType* out,
+    std::int32_t* outBuffer,
+    processOutputType& outProcess,
+    int thread_id,
+    int num_threads,
+    const BlockingFactors* blocking_params = nullptr);
+/**
+ * @brief Returns which fast path to take
+ *
+ * @tparam SPATIAL_DIM It's 2 for 2D convolutions and 3 for 3D convolutions.
+ *
+ * @return optimized_conv_t::depthwise, optimized_conv_t::groupwise or
+ *         optimized_conv_t::im2col
+ *
+ */
+template <int SPATIAL_DIM = 2, typename ACC_T = std::int32_t>
+FBGEMM_API optimized_conv_t
+ConvFastPath(const conv_param_t<SPATIAL_DIM>& conv_p);
+} // namespace fbgemm
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmBuild.h ADDED Viewed

	@@ -0,0 +1,116 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+// For details about dllexport/dllimport, checkout the following SO question
+// https://stackoverflow.com/questions/57999/what-is-the-difference-between-dllexport-and-dllimport
+#if !defined(FBGEMM_API)
+#if defined(FBGEMM_STATIC)
+#define FBGEMM_API
+#define FBGEMM_ENUM_CLASS_API
+#elif defined _WIN32 || defined __CYGWIN__
+#if (__GNUC__ || __clang__) && !(__MINGW64__ || __MINGW32__)
+#if defined(FBGEMM_EXPORTS)
+#define FBGEMM_API __attribute__((__dllexport__))
+#else
+#define FBGEMM_API __attribute__((__dllimport__))
+#endif
+#else
+#if defined(FBGEMM_EXPORTS)
+#define FBGEMM_API __declspec(dllexport)
+#else
+#define FBGEMM_API __declspec(dllimport)
+#endif
+#endif
+#define FBGEMM_ENUM_CLASS_API
+#else
+#if __clang__ || __GNUC__ || __INTEL_COMPILER
+#define FBGEMM_API __attribute__((__visibility__("default")))
+#else
+#define FBGEMM_API
+#endif
+// Currently, enum classes need to be declared explicitly for shared build on
+// macos
+#if __clang__
+#define FBGEMM_ENUM_CLASS_API __attribute__((__visibility__("default")))
+#else
+#define FBGEMM_ENUM_CLASS_API
+#endif
+#endif
+#endif
+// Use this to indicate to not inline functions
+#if __clang__ || __GNUC__ || __INTEL_COMPILER
+#define NOINLINE __attribute__((noinline))
+#elif _MSC_VER
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE
+#endif
+// Use this to indicate always inline functions
+#if __clang__ || __GNUC__ || __INTEL_COMPILER
+#define ALWAYS_INLINE inline __attribute__((__always_inline__))
+#elif _MSC_VER
+// commenting out because __forceinline takes too long time in MSVC
+#define ALWAYS_INLINE // __forceinline
+#else
+#define ALWAYS_INLINE inline
+#endif
+// Use the C++11 keyword "alignas" if you can
+#if _MSC_VER
+#define ALIGNAS(byte_alignment) __declspec(align(byte_alignment))
+#else
+#define ALIGNAS(byte_alignment) __attribute__((aligned(byte_alignment)))
+#endif
+// Sanitizers annotations
+#if defined(__has_attribute)
+#if __has_attribute(no_sanitize)
+#define NO_SANITIZE(what) __attribute__((no_sanitize(what)))
+#endif
+#endif
+#if !defined(NO_SANITIZE)
+#define NO_SANITIZE(what)
+#endif
+// Ignore __builtin_assume() when not supported by compiler.
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+#if !__has_builtin(__builtin_assume)
+#define __builtin_assume(x) (static_cast<void>(0))
+#endif
+// Macro for silencing warnings
+#if __clang__ || __GNUC__
+// clang-format off
+#define FBGEMM_PUSH_WARNING _Pragma("GCC diagnostic push")
+#define FBGEMM_DISABLE_WARNING_INTERNAL2(warningName) #warningName
+#define FBGEMM_DISABLE_WARNING(warningName) \
+  _Pragma(                                     \
+      FBGEMM_DISABLE_WARNING_INTERNAL2(GCC diagnostic ignored warningName))
+#define FBGEMM_PUSH_WARNING_AND_DISABLE(warningName) \
+  _Pragma("GCC diagnostic push") \
+  _Pragma(                                     \
+      FBGEMM_DISABLE_WARNING_INTERNAL2(GCC diagnostic ignored warningName))
+#define FBGEMM_POP_WARNING _Pragma("GCC diagnostic pop")
+// clang-format on
+#else
+#define FBGEMM_PUSH_WARNING
+#define FBGEMM_DISABLE_WARNING(NAME)
+#define FBGEMM_PUSH_WARNING_AND_DISABLE(NAME)
+#define FBGEMM_POP_WARNING
+#endif
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmConvert.h ADDED Viewed

	@@ -0,0 +1,205 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include "fbgemm/FbgemmBuild.h"
+#include "fbgemm/Types.h"
+namespace fbgemm {
+/**
+ * @brief Transform all entries in a matrix from fp32 to bfloat16: reference
+ * implementation.
+ *
+ */
+FBGEMM_API void
+FloatToBfloat16_ref(const float* src, bfloat16* dst, size_t size);
+/**
+ * @brief Transform all entries in a matrix from bfloat16 to fp32: reference
+ * implementation.
+ *
+ */
+FBGEMM_API void
+Bfloat16ToFloat_ref(const bfloat16* src, float* dst, size_t size);
+/**
+ * @brief Transform all entries in a matrix from fp32 to bfloat16: simd
+ * implementation.
+ *
+ */
+FBGEMM_API void
+FloatToBfloat16_simd(const float* src, bfloat16* dst, size_t size);
+/**
+ * @brief Transform all entries in a matrix from bfloat16 to fp32: simd
+ * implementation.
+ *
+ */
+FBGEMM_API void
+Bfloat16ToFloat_simd(const bfloat16* src, float* dst, size_t size);
+#if !defined(__aarch64__)
+/**
+ * @brief AVX2 implementation to convert fp32 numbers to bf16 numbers.
+ *
+ */
+FBGEMM_API void
+FloatToBfloat16_avx2(const float* src, bfloat16* dst, size_t size);
+/**
+ * @brief AVX512 implementation to convert fp32 numbers to bf16 numbers.
+ *
+ */
+FBGEMM_API void
+FloatToBfloat16_avx512(const float* src, bfloat16* dst, size_t size);
+/**
+ * @brief AVX2 implementation to convert bf16 numbers to fp32 numbers.
+ *
+ */
+FBGEMM_API void
+Bfloat16ToFloat_avx2(const bfloat16* src, float* dst, size_t size);
+/**
+ * @brief AVX512 implementation to convert bf16 numbers to fp32 numbers.
+ *
+ */
+FBGEMM_API void
+Bfloat16ToFloat_avx512(const bfloat16* src, float* dst, size_t size);
+#endif
+/**
+ * @brief Transform all entries in a matrix from fp32 to float16: reference
+ * implementation.
+ *
+ * @param do_clip if true we saturate to fp16 min and max instead of generating
+ *                infinities.
+ */
+FBGEMM_API void FloatToFloat16_ref(
+    const float* src,
+    float16* dst,
+    size_t size,
+    bool do_clip = false);
+/**
+ * @brief Transform all entries in a matrix from float16 to fp32: reference
+ * implementation.
+ *
+ */
+FBGEMM_API void Float16ToFloat_ref(const float16* src, float* dst, size_t size);
+/**
+ * @brief Transform all entries in a matrix from fp32 to float16: simd
+ * implementation.
+ *
+ * @param do_clip if true we saturate to fp16 min and max instead of generating
+ *                infinities.
+ */
+FBGEMM_API void FloatToFloat16_simd(
+    const float* src,
+    float16* dst,
+    size_t size,
+    bool do_clip = false);
+/**
+ * @brief Transform all entries in a matrix from float16 to fp32: simd
+ * implementation.
+ *
+ */
+FBGEMM_API void
+Float16ToFloat_simd(const float16* src, float* dst, size_t size);
+/**
+ * @brief AVX2 implementation to convert fp32 numbers to fp16 numbers.
+ *
+ */
+#if !defined(__aarch64__)
+FBGEMM_API void FloatToFloat16_avx2(
+    const float* src,
+    float16* dst,
+    size_t size,
+    bool do_clip = false);
+/**
+ * @brief AVX512 implementation to convert fp32 numbers to fp16 numbers.
+ *
+ */
+FBGEMM_API void FloatToFloat16_avx512(
+    const float* src,
+    float16* dst,
+    size_t size,
+    bool do_clip = false);
+#endif
+/**
+ * @brief SVE2 implementation to convert fp32 numbers to fp16 numbers.
+ *
+ */
+FBGEMM_API void FloatToFloat16_sve2(
+    const float* src,
+    float16* dst,
+    size_t size,
+    bool do_clip = false);
+#if !defined(__aarch64__)
+/**
+ * @brief AVX2 implementation to convert fp16 numbers to fp32 numbers.
+ *
+ */
+FBGEMM_API void
+Float16ToFloat_avx2(const float16* src, float* dst, size_t size);
+/**
+ * @brief AVX512 implementation to convert fp16 numbers to fp32 numbers.
+ *
+ */
+FBGEMM_API void
+Float16ToFloat_avx512(const float16* src, float* dst, size_t size);
+#endif
+/**
+ * @brief Transform all entries in a matrix from fp32 to float16 and back to
+ * fp32.
+ */
+FBGEMM_API void RoundToFloat16(
+    const float* input,
+    float* output,
+    size_t size,
+    bool clamp = false,
+    bool clamp_denorms = false);
+/**
+ * @brief Quantize float32 to float8. The code is a copy of float_to_hfp8() in
+ * fbgemm_gpu/quantize_ops_utils.h
+ */
+FBGEMM_API void FloatToFloat8_ref(
+    float input,
+    uint8_t* output,
+    int exponent_bits,
+    int exponent_bias);
+/**
+ * @brief Dequantize float8 to float32. The code is a copy of hf8_to_float() in
+ * fbgemm_gpu/quantize_ops_utils.h
+ */
+FBGEMM_API void Float8ToFloat_ref(
+    uint8_t input,
+    float* output,
+    int exponent_bits,
+    int exponent_bias);
+} // namespace fbgemm
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmEmbedding.h ADDED Viewed

	@@ -0,0 +1,383 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include <cstdint>
+#include <functional>
+#include "fbgemm/FbgemmBuild.h"
+namespace fbgemm {
+template <
+    typename InType,
+    typename IndexType,
+    typename OffsetType = std::int32_t,
+    typename OutType = float>
+class EmbeddingSpMDMKernelSignature {
+ public:
+  /**
+   * Callable type returned by the GenerateEmbeddingSpMDM* functions declared
+   * in this header.
+   *
+   * Behavior is as in the following pseudocode
+   * (when use_offsets == true, lengths[i] == offsets[i + 1] - offsets[i])
+   * (when is_weight_positional == true, use weights[j - offsets[i]] instead of
+   *  weights[j])
+   *
+   * for i in range(output_size):
+   *  out[i * block_size : (i + 1) * block_size] = 0
+   *  for j in range(offsets[i], offsets[i + 1]):
+   *   for k in range(block_size):
+   *    out[i * block_size + k] += input[indices[j] * block_size + k] *
+   *                               (weights ? weights[j] : 1);
+   *  if normalize_by_lengths and lengths[i] > 0:
+   *   out[i * block_size : (i + 1) * block_size] /= lengths[i]
+   *
+   * @param output_size the number of output rows (the range of i above)
+   * @param index_size the length of indices; when offsets are used,
+   *                   offsets[output_size] should be index_size
+   * @param data_size the number of rows in embedding table
+   * @param input the embedding table
+   * @param indices row indices into input
+   * @param offsets_or_lengths offsets or lengths, depending on the use_offsets
+   *                           flag the kernel was generated with
+   * @param weights optional, can be null for non-weighted sum
+   * @param out the output buffer
+   * @return bool; presumably false when the inputs are invalid (e.g. an index
+   *         out of range) — TODO confirm against the implementation
+   */
+  using Type = std::function<bool(
+      std::int64_t output_size,
+      std::int64_t index_size,
+      std::int64_t data_size,
+      const InType* input,
+      const IndexType* indices,
+      const OffsetType* offsets_or_lengths,
+      const float* weights, // optional, can be null for non-weighted sum
+      OutType* out)>;
+};
+/**
+ * @tparam InType can be float, float16, or uint8_t
+ * @tparam IndexType can be int32_t or int64_t
+ * @tparam OffsetType can be int32_t or int64_t
+ *
+ * @param use_offsets If true, the generated code assumes we will pass offsets
+ *                    instead of lengths, which conforms to the PyTorch
+ *                    EmbeddingBag interface. In this case, the length of the
+ *                    offsets array should be output_size + 1 and
+ *                    offsets[output_size] should be index_size.
+ *                    If false, the generated code assumes we will pass lengths,
+ *                    which conforms to the Caffe2 SparseLengthsSum interface.
+ */
+template <
+    typename InType,
+    typename IndexType,
+    typename OffsetType = std::int32_t,
+    typename OutType = float,
+    bool THREAD_LOCAL = false>
+FBGEMM_API typename EmbeddingSpMDMKernelSignature<
+    InType,
+    IndexType,
+    OffsetType,
+    OutType>::Type
+GenerateEmbeddingSpMDM(
+    const std::int64_t block_size,
+    bool has_weight,
+    bool normalize_by_lengths,
+    int prefetch = 16,
+    bool is_weight_positional = false,
+    bool use_offsets = true,
+    bool is_bf16_out = false,
+    bool is_bf16_in = false);
+/**
+ * Variant of GenerateEmbeddingSpMDM that additionally takes output_stride,
+ * input_stride, scale_bias_last and no_bag; the other parameters carry the
+ * same names as in GenerateEmbeddingSpMDM.
+ *
+ * @param output_stride If -1, output_stride is same as block_size
+ * @param input_stride If -1, input_stride is same as block_size
+ * @param scale_bias_last if false, scale and bias appear at the beginning
+ *        of each row and are in fp16 for table batched embedding (TBE)
+ *        in FBGEMM_GPU. If false, it can also take -1 indices (output from
+ *        pruned embedding id mapping)
+ */
+template <
+    typename InType,
+    typename IndexType,
+    typename OffsetType = std::int32_t,
+    typename OutType = float,
+    bool THREAD_LOCAL = false>
+FBGEMM_API typename EmbeddingSpMDMKernelSignature<
+    InType,
+    IndexType,
+    OffsetType,
+    OutType>::Type
+GenerateEmbeddingSpMDMWithStrides(
+    const std::int64_t block_size,
+    bool has_weight,
+    bool normalize_by_lengths,
+    int prefetch = 16,
+    bool is_weight_positional = false,
+    bool use_offsets = true,
+    std::int64_t output_stride = -1,
+    std::int64_t input_stride = -1,
+    bool scale_bias_last = true,
+    bool no_bag = false,
+    bool is_bf16_out = false,
+    bool is_bf16_in = false);
+/**
+ * The returned kernel type fixes InType to std::uint8_t.
+ *
+ * @tparam IndexType can be int32_t or int64_t
+ * @tparam OffsetType can be int32_t or int64_t
+ * @param bit_rate can be 2 or 4
+ */
+template <
+    typename IndexType,
+    typename OffsetType = std::int32_t,
+    typename OutType = float>
+FBGEMM_API typename EmbeddingSpMDMKernelSignature<
+    std::uint8_t,
+    IndexType,
+    OffsetType,
+    OutType>::Type
+GenerateEmbeddingSpMDMNBit(
+    int bit_rate,
+    const std::int64_t block_size,
+    bool has_weight,
+    bool normalize_by_lengths,
+    int prefetch = 16,
+    bool is_weight_positional = false,
+    bool use_offsets = true);
+/**
+ * The returned kernel type fixes InType to std::uint8_t.
+ *
+ * Note for positional callers: here is_bf16_out comes before no_bag, whereas
+ * GenerateEmbeddingSpMDMWithStrides declares no_bag before is_bf16_out.
+ *
+ * @param output_stride If -1, output_stride is same as block_size
+ * @param input_stride in Bytes. If -1, input_stride is same as
+ *                     block_size / num_elem_per_byte + 2 * sizeof(float16)
+ * @param scale_bias_last if false, scale and bias appear at the beginning
+ *        of each row and are in fp16 for table batched embedding (TBE)
+ *        in FBGEMM_GPU. If false, it can also take -1 indices (output from
+ *        pruned embedding id mapping)
+ * @param output_bit_rate defaults to -1; presumably means "same as
+ *        input_bit_rate" — TODO confirm
+ */
+template <
+    typename IndexType,
+    typename OffsetType = std::int32_t,
+    typename OutType = float,
+    bool THREAD_LOCAL = false>
+FBGEMM_API typename EmbeddingSpMDMKernelSignature<
+    std::uint8_t,
+    IndexType,
+    OffsetType,
+    OutType>::Type
+GenerateEmbeddingSpMDMNBitWithStrides(
+    const int input_bit_rate,
+    const std::int64_t block_size,
+    bool has_weight,
+    bool normalize_by_lengths,
+    int prefetch = 16,
+    bool is_weight_positional = false,
+    bool use_offsets = true,
+    std::int64_t output_stride = -1,
+    std::int64_t input_stride = -1,
+    bool scale_bias_last = true,
+    const bool is_bf16_out = false,
+    const bool no_bag = false,
+    int output_bit_rate = -1);
+/**
+ * The returned kernel type fixes InType to std::uint8_t. Unlike the other
+ * generators in this header, this one takes no has_weight or prefetch
+ * parameter.
+ *
+ * @param output_stride If -1, output_stride is same as block_size
+ * @param input_stride in Bytes. If -1, input_stride is same as
+ *                     block_size / num_elem_per_byte + 2 * sizeof(float16)
+ * @param exponent_bits is the number of exponent bits in the FP8 encode
+ *                      (normally 4 or 5)
+ * @param exponent_bias is subtracted from the exponent to obtain the actual
+ *                      exponent for the floating-point number
+ */
+template <
+    typename IndexType,
+    typename OffsetType = std::int32_t,
+    typename OutType = float>
+FBGEMM_API typename EmbeddingSpMDMKernelSignature<
+    std::uint8_t,
+    IndexType,
+    OffsetType,
+    OutType>::Type
+GenerateEmbeddingSpMDMFP8WithStrides(
+    const std::int64_t block_size,
+    bool normalize_by_lengths,
+    bool is_weight_positional = false,
+    bool use_offsets = true,
+    std::int64_t output_stride = -1,
+    std::int64_t input_stride = -1,
+    int exponent_bits = 4,
+    int exponent_bias = 7,
+    bool is_bf16_out = false);
+template <
+    typename InType,
+    typename IndexType,
+    typename OffsetType = std::int32_t>
+class EmbeddingSpMDMRowWiseSparseKernelSignature {
+ public:
+  using Type = std::function<bool(
+      std::int64_t output_size,
+      std::int64_t index_size,
+      std::int64_t uncompressed_data_size,
+      // TODO: add compressed_data_size and check array bound
+      const InType* input,
+      const IndexType* indices,
+      const OffsetType* offsets_or_lengths,
+      const float* weights, // optional, can be null for non-weighted sum
+      float* out, // output is always float here (no OutType parameter)
+      const std::int32_t* compressed_indices_table)>; // presumably maps uncompressed row ids to compressed rows — TODO confirm
+};
+/**
+ * Returns a kernel of type EmbeddingSpMDMRowWiseSparseKernelSignature::Type.
+ * The parameters carry the same names as in GenerateEmbeddingSpMDM.
+ *
+ * @tparam InType can be float, float16, or uint8_t
+ * @tparam IndexType can be int32_t or int64_t
+ * @tparam OffsetType can be int32_t or int64_t
+ */
+template <
+    typename InType,
+    typename IndexType,
+    typename OffsetType = std::int32_t>
+FBGEMM_API typename EmbeddingSpMDMRowWiseSparseKernelSignature<
+    InType,
+    IndexType,
+    OffsetType>::Type
+GenerateEmbeddingSpMDMRowWiseSparse(
+    const std::int64_t block_size,
+    bool has_weight,
+    bool normalize_by_lengths,
+    int prefetch = 16,
+    bool is_weight_positional = false,
+    bool use_offsets = true);
+/**
+ * Returns a kernel of type EmbeddingSpMDMRowWiseSparseKernelSignature::Type
+ * with InType fixed to std::uint8_t.
+ *
+ * @tparam IndexType can be int32_t or int64_t
+ * @tparam OffsetType can be int32_t or int64_t
+ * @param bit_rate can be 2 or 4
+ */
+template <typename IndexType, typename OffsetType = std::int32_t>
+FBGEMM_API typename EmbeddingSpMDMRowWiseSparseKernelSignature<
+    std::uint8_t,
+    IndexType,
+    OffsetType>::Type
+GenerateEmbeddingSpMDMNBitRowWiseSparse(
+    int bit_rate,
+    const std::int64_t block_size,
+    bool has_weight,
+    bool normalize_by_lengths,
+    int prefetch = 16,
+    bool is_weight_positional = false,
+    bool use_offsets = true);
+/**
+ * Callable type returned by GenerateSparseAdaGrad.
+ *
+ * @return The number of rows processed. If smaller than num_rows, an error
+ *         must have happened at the last row processed.
+ *
+ * NOTE(review): the trailing comment on counter_halflife below is cut off
+ * mid-sentence in this header; presumably it should read "frequency adjust
+ * happens only after counter_halflife iterations" — confirm against upstream.
+ */
+template <typename IndexType>
+class SparseAdaGradSignature {
+ public:
+  using Type = std::function<int(
+      int num_rows, // number of rows reading
+      std::uint64_t param_size, // total number of parameters
+      float* w, // input/output parameters
+      const float* g, // input gradients
+      float* h, // input/output momentums
+      const IndexType* indices, // indices of each row
+      float epsilon,
+      float lr,
+      float weight_decay,
+      const double* counter, // used for weight_decay adjusted for frequency
+                             // nullptr when frequency adjustment is not used.
+                             // ignored when the kernel is generated with
+                             // use_weight_decay = false.
+      std::int64_t counter_halflife)>; // frequency adjust happens only after
+};
+template <typename IndexType>
+FBGEMM_API typename SparseAdaGradSignature<IndexType>::Type
+GenerateSparseAdaGrad(
+    int block_size, // number of parameters per row
+    bool rowwise = false, // presumably selects the row-wise AdaGrad variant — TODO confirm
+    int prefetch = 16,
+    bool use_weight_decay = false); // when false, the generated kernel ignores `counter`
+// RowWiseSparseAdaGrad fused with SLS gradient
+// Weights can be either float or float16 (the DataType template parameter);
+// gradients and momentums are always float.
+template <
+    typename IndexType,
+    typename OffsetType = std::int32_t,
+    typename DataType = float>
+class RowWiseSparseAdaGradFusedSignature {
+ public:
+  using Type = std::function<bool(
+      std::int64_t output_size,
+      std::int64_t index_size,
+      std::int64_t data_size, // number of rows in w
+      DataType* w, // input/output parameters
+      const float* g, // input gradients
+      float* h, // input/output momentums
+      const IndexType* indices, // indices of each row
+      const OffsetType* offsets_or_lengths,
+      float epsilon,
+      float lr)>;
+};
+/**
+ * Returns a kernel of type RowWiseSparseAdaGradFusedSignature::Type.
+ *
+ * @param use_offsets defaults to true; presumably selects whether the kernel's
+ *                    offsets_or_lengths argument holds offsets or lengths —
+ *                    TODO confirm
+ * @param use_stochastic_rounding defaults to true; presumably only relevant
+ *                                when DataType is float16 — TODO confirm
+ * @param grad_stride If -1, grad_stride is same as block size
+ */
+template <
+    typename IndexType,
+    typename OffsetType = std::int32_t,
+    typename DataType = float>
+FBGEMM_API typename RowWiseSparseAdaGradFusedSignature<
+    IndexType,
+    OffsetType,
+    DataType>::Type
+GenerateRowWiseSparseAdaGradFused(
+    int block_size, // number of parameters per row
+    int prefetch = 16,
+    bool use_offsets = true,
+    bool use_stochastic_rounding = true,
+    int grad_stride = -1);
+namespace internal {
+// Specialization for block size 1 internally called by GenerateEmbeddingSpMDM.
+// Takes the kernel arguments and the generator flags in a single call; the
+// output is always float.
+template <typename InType, typename IndexType, typename OffsetType>
+FBGEMM_API bool EmbeddingSpMDMBlockSize1_(
+    const std::int64_t output_size,
+    const std::int64_t index_size,
+    const std::int64_t data_size, // the number of rows in input
+    const InType* input,
+    const IndexType* indices,
+    const OffsetType* offsets_or_lengths,
+    const float* weights, // optional, can be null for non-weighted sum
+    bool normalize_by_lengths,
+    float* out,
+    bool is_weight_positional = false,
+    bool use_offsets = true,
+    bool is_bf16 = false);
+#if !defined(__aarch64__) // declared only for non-aarch64 builds
+template <typename IndexType, bool HAS_WEIGHTS>
+void compressed_indices_remap_avx512(
+    std::int32_t offsets_numel,
+    const IndexType* indices,
+    const int32_t* compressed_indices_mapping,
+    const IndexType* offsets,
+    const float* weights, // optional, can be null
+    IndexType* out_indices,
+    IndexType* out_offsets,
+    float* out_weights);
+#endif
+} // namespace internal
+template <typename IndexType>
+FBGEMM_API void compressed_indices_remap(
+    std::int32_t offsets_numel, // presumably the element count of offsets — TODO confirm
+    const IndexType* indices,
+    const int32_t* compressed_indices_mapping,
+    const IndexType* offsets,
+    const float* weights, // optional, can be null
+    IndexType* out_indices,
+    IndexType* out_offsets,
+    float* out_weights);
+} // namespace fbgemm
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmFP16.h ADDED Viewed

	@@ -0,0 +1,60 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+// WARNING: this is a legacy fp16 fbgemm implementation and will soon be
+// upgraded to match with new fbgemm interface.
+#include <cpuinfo.h>
+#include "./FbgemmPackMatrixB.h" // @manual
+#include "./FloatConversion.h" // @manual
+#include "./Types.h" // @manual
+#include "./Utils.h" // @manual
+namespace fbgemm {
+template <>
+struct TypeConverter<float16> {
+  // Saturates src to [-65504, 65504] and converts the result with
+  // cpu_float2half. A NaN input compares false in both tests below and so
+  // comes out as -65504, the same result as
+  // std::max(-65504.f, std::min(src, 65504.f)).
+  float16 operator()(float src) const {
+    constexpr float kLargestHalf = 65504.f;
+    float saturated = src;
+    if (kLargestHalf < saturated) {
+      saturated = kLargestHalf;
+    }
+    if (!(-kLargestHalf < saturated)) {
+      saturated = -kLargestHalf;
+    }
+    return cpu_float2half(saturated);
+  }
+};
+using PackedGemmMatrixFP16 = PackedGemmMatrixB<float16>;
+template <typename T>
+FBGEMM_API void cblas_gemm_compute(
+    const matrix_op_t transa,
+    const int m, // number of rows of A and C — presumably; TODO confirm
+    const float* A,
+    const PackedGemmMatrixB<T>& Bp, // pre-packed B matrix
+    const float beta,
+    float* C,
+    int thread_id = 0,
+    int num_threads = 1);
+extern template void cblas_gemm_compute<float16>( // explicit instantiation declaration for the fp16 packed matrix
+    const matrix_op_t transa,
+    const int m,
+    const float* A,
+    const PackedGemmMatrixFP16& Bp,
+    const float beta,
+    float* C,
+    int thread_id,
+    int num_threads);
+}; // namespace fbgemm
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmFP32.h ADDED Viewed

	@@ -0,0 +1,54 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+#pragma once
+// WARNING: this is a legacy fp16 fbgemm implementation and will soon be
+// upgraded to match with new fbgemm interface.
+#include <cpuinfo.h>
+#include "fbgemm/FbgemmFPCommon.h"
+#include "fbgemm/FbgemmPackMatrixB.h"
+#include "fbgemm/Utils.h"
+namespace fbgemm {
+template <>
+struct TypeConverter<float> {
+  float operator()(float src) const {
+    return src; // identity: the value is returned unchanged
+  }
+};
+using GemmParamsFP32 = GemmParams<float>;
+using PackedGemmMatrixFP32 = PackedGemmMatrixB<float>;
+template <typename T, int _kernel_ncol_blocks, int _brow> // NOTE(review): the two int parameters cannot be deduced from the arguments — confirm this overload is still used
+void cblas_gemm_compute(
+    const matrix_op_t transa,
+    const int m,
+    const float* A,
+    const PackedGemmMatrixB<T>& Bp,
+    const float beta,
+    float* C,
+    int thread_id = 0,
+    int num_threads = 1);
+extern template void cblas_gemm_compute( // explicit instantiation declaration for PackedGemmMatrixFP32; presumably binds to the single-parameter template from FbgemmFPCommon.h — TODO confirm
+    const matrix_op_t transa,
+    const int m,
+    const float* A,
+    const PackedGemmMatrixFP32& Bp,
+    const float beta,
+    float* C,
+    int thread_id,
+    int num_threads);
+template <> // declares the float specialization of getIsaHandlers; the definition is not in this header
+const isa_descriptor<float>& getIsaHandlers(inst_set_t isa);
+} // namespace fbgemm
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmFPCommon.h ADDED Viewed

	@@ -0,0 +1,319 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * Copyright 2024-2025 Arm Limited and/or its affiliates
+ * <open-source-office@arm.com> All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include <fbgemm/FbgemmPackMatrixB.h>
+#include <fbgemm/Types.h>
+#include <fbgemm/Utils.h>
+#include <array>
+#include <memory>
+#if defined(FBGEMM_FP16_FALLBACK_TO_REF_KERNEL) || \
+    defined(FBGEMM_FP32_FALLBACK_TO_REF_KERNEL)
+#if defined(__APPLE__) && defined(__aarch64__)
+#define FBGEMM_USE_REF_KERNEL
+#endif
+#endif
+namespace fbgemm {
+using partition_array_t = std::array<std::array<std::array<int, 2>, 2>, 121>; // partition[mb][c] = {kernel_nrows, nkernel_nrows}; cblas_gemm_compute indexes mb in [1, 120]
+extern partition_array_t partition_avx2;
+extern partition_array_t partition_avx512;
+extern partition_array_t partition_sve128;
+#ifdef FBGEMM_ENABLE_KLEIDIAI
+extern partition_array_t partition_neon;
+#endif
+template <typename T>
+struct GemmParams { // argument bundle passed to a GEMM micro-kernel; comments describe how cblas_gemm_compute in this header fills it
+  uint64_t k; // rows in the current K block (kb)
+  float* A; // packed A scratchpad, or a pointer into the caller's A
+  const T* B; // start of the packed B block for this K block
+  float beta; // scale applied to the existing C (forced to 1 after the first K block)
+  float* C; // first output row handled by this call
+  uint64_t ldc; // row stride of C in bytes
+  uint64_t b_block_cols; // number of B column blocks to process
+  uint64_t b_block_size; // size of one packed B column block in bytes
+};
+template <>
+struct GemmParams<float16> { // fp16 specialization: the last field depends on FBGEMM_ENABLE_KLEIDIAI
+  uint64_t k; // rows in the current K block
+  float* A;
+  const float16* B;
+  float beta;
+  float* C;
+  uint64_t ldc; // row stride of C in bytes (cblas_gemm_compute sets ldc * sizeof(float))
+  uint64_t b_block_cols; // number of B column blocks to process
+#ifdef FBGEMM_ENABLE_KLEIDIAI
+  uint64_t lda; // row stride of A in bytes (cblas_gemm_compute sets k * sizeof(float))
+#else
+  uint64_t b_block_size; // size of one packed B column block in bytes
+#endif
+};
+template <>
+struct GemmParams<float> { // fp32 specialization: the last field depends on FBGEMM_ENABLE_KLEIDIAI
+  uint64_t k; // rows in the current K block
+  float* A;
+  const float* B;
+  float beta;
+  float* C;
+  uint64_t ldc; // row stride of C in bytes (cblas_gemm_compute sets ldc * sizeof(float))
+  uint64_t b_block_cols; // number of B column blocks to process
+#ifdef FBGEMM_ENABLE_KLEIDIAI
+  uint64_t lda; // row stride of A in bytes (cblas_gemm_compute sets k * sizeof(float))
+#else
+  uint64_t b_block_size; // size of one packed B column block in bytes
+#endif
+};
+template <typename T>
+using funcptr_t = void (*)(GemmParams<T>*); // a GEMM micro-kernel entry point
+template <typename T>
+using kernel_array_t = std::array<funcptr_t<T>, 15>; // cblas_gemm_compute indexes this by kernel_nrows
+template <typename T>
+using isa_descriptor = std::tuple<kernel_array_t<T>, partition_array_t>; // NOTE(review): <tuple> is not included directly by this header
+template <typename T>
+extern const isa_descriptor<T>& getIsaHandlers(inst_set_t isa); // returns the kernel table and partition table for the given instruction set
+void PackA(int nrow, int ncol, const float* from, int ldim, float* to); // cblas_gemm_compute packs an nrow x ncol tile of A (row stride ldim) into `to`; exact packed layout not shown here
+// define fp16/fp32 kernels using a reference C implementation
+#if defined(FBGEMM_FP16_FALLBACK_TO_REF_KERNEL) || \
+    defined(FBGEMM_FP32_FALLBACK_TO_REF_KERNEL)
+template <typename T>
+FBGEMM_API void ref_kernel( // reference C kernel used in place of kernels[kernel_nrows] when FBGEMM_USE_REF_KERNEL is defined
+    int kernel_nrows,
+    GemmParams<T>* gp,
+    const float* C_base, // cblas_gemm_compute passes the start of the output buffer
+    int m_total, // row count of the buffer behind C_base
+    int n_total, // column count of the buffer behind C_base
+    int vlen); // cblas_gemm_compute passes the SIMD width in 32-bit elements
+#endif
+template <typename T>
+FBGEMM_API void cblas_gemm_compute(
+    const matrix_op_t transa, // the definition below asserts matrix_op_t::NoTranspose
+    const int m, // number of rows of A and C
+    const float* A, // row-major, row stride Bp.numRows()
+    const PackedGemmMatrixB<T>& Bp, // pre-packed B matrix
+    const float beta, // scale applied to the existing contents of C
+    float* C, // row-major output, row stride Bp.numCols()
+    int thread_id = 0, // this caller's index in [0, num_threads)
+    int num_threads = 1); // B column blocks are partitioned across this many threads
+#if defined(FBGEMM_EXPORTS)
+// autotuned kernel splits for various cases m = 1:mb_max
+//
+// Computes C = A * Bp + beta * C for row-major A (m x k) and C (m x n), where
+// k = Bp.numRows() and n = Bp.numCols(). Rows are processed in blocks of at
+// most mb_max, each block is split into kernel calls according to the ISA's
+// partition table, and K is walked in steps of Bp.blockRowSize(). Full column
+// blocks of B are divided among threads with fbgemmPartition1D; the leftover
+// columns (n not a multiple of Bp.blockColSize()) are handled by the last
+// thread through a small temporary buffer.
+//
+// transa must be matrix_op_t::NoTranspose (checked with assert only).
+template <typename T>
+void cblas_gemm_compute(
+    const matrix_op_t transa [[maybe_unused]],
+    const int m,
+    const float* A,
+    const PackedGemmMatrixB<T>& Bp,
+    const float beta,
+    float* C,
+    int thread_id,
+    int num_threads) {
+  // ground truth
+  // Keep the call outside assert() so that cpuinfo is still initialized when
+  // NDEBUG compiles the assertion away.
+  const bool cpuinfo_ready [[maybe_unused]] = cpuinfo_initialize();
+  assert(cpuinfo_ready);
+#ifndef __aarch64__
+  assert(cpuinfo_has_x86_fma3());
+  assert(cpuinfo_has_x86_f16c());
+#endif
+  assert(transa == matrix_op_t::NoTranspose);
+  // private scratchpad storage
+  static thread_local std::unique_ptr<std::array<float, 256 * 1024>> scratchpad(
+      new std::array<float, 256 * 1024>());
+  // constants
+  const int n = Bp.numCols(), k = Bp.numRows(), ldc = n;
+  const int mb_max = 120;
+#if defined(FBGEMM_USE_REF_KERNEL) && defined(__APPLE__)
+  // NOTE(review): the partition table is taken from the float16/sve handlers
+  // regardless of T — confirm this is intended for T = float.
+  const auto& [_, partition] = getIsaHandlers<float16>(inst_set_t::sve);
+#else
+  const auto iset = fbgemmInstructionSet();
+  const auto& [kernels, partition] = getIsaHandlers<T>(iset);
+#endif
+#ifdef FBGEMM_USE_REF_KERNEL
+  // By some reason, if packed B is using packing layout for avx2, we just use
+  // avx2 even if avx512 is available.
+  const int simd_width =
+#ifndef __aarch64__
+      (iset == inst_set_t::avx512 || iset == inst_set_t::avx512_vnni) &&
+          (Bp.blockColSize() == 16 * Bp.kernelNumColBlocks())
+      ? simd_info<inst_set_t::avx512>::WIDTH_32BIT_ELEMS
+      : simd_info<inst_set_t::avx2>::WIDTH_32BIT_ELEMS;
+#else
+      simd_info<inst_set_t::sve>::WIDTH_32BIT_ELEMS;
+#endif
+#endif
+  GemmParams<T> gp;
+  int i_begin = 0, i_end = m;
+  for (auto m0 = i_begin; m0 < i_end; m0 += mb_max) {
+    int mb = std::min(mb_max, i_end - m0);
+    assert(mb < static_cast<int64_t>(partition.size()));
+    for (auto k_ind = 0; k_ind < k; k_ind += Bp.blockRowSize()) {
+      // set up proper accumulation to avoid "Nan" problem
+      // accumulate of beta != 0.0
+      // do not!!! accumulate otherwise
+      float beta_ = beta;
+      if (k_ind != 0) {
+        // always accumulate with beta_ = 1.0f
+        beta_ = 1.0f;
+      }
+      const int kb = std::min(Bp.blockRowSize(), Bp.numRows() - k_ind);
+      auto m1 = m0;
+      auto const num_cycles = partition[mb].size();
+      for (size_t c = 0; c < num_cycles; ++c) {
+        // Each partition step says: run the kernel_nrows-row kernel
+        // nkernel_nrows times.
+        auto kernel_nrows = partition[mb][c][0];
+        auto nkernel_nrows = partition[mb][c][1];
+        auto m_start = m1;
+        auto m_end = m1 + kernel_nrows * nkernel_nrows;
+        for (auto m2 = m_start; m2 < m_end; m2 += kernel_nrows) {
+          assert(kernel_nrows * kb < static_cast<int64_t>(scratchpad->size()));
+          if (m != 1) {
+#ifdef FBGEMM_ENABLE_KLEIDIAI
+            if constexpr (
+                std::is_same<T, float16>::value ||
+                std::is_same<T, float>::value) {
+              gp.A = const_cast<float*>(&A[m2 * k + k_ind]);
+            } else {
+#endif
+              PackA(
+                  kernel_nrows, kb, &A[m2 * k + k_ind], k, scratchpad->data());
+              gp.A = scratchpad->data();
+#ifdef FBGEMM_ENABLE_KLEIDIAI
+            }
+#endif
+          } else {
+            // When m == 1, it is actually vector matrix multiplication. We
+            // don't need to do the transposition for packA here. Instead, we
+            // can just pass the pointer of the original A matrix buffer to the
+            // packed A buffer.
+            gp.A = const_cast<float*>(&A[k_ind]);
+          }
+          int nbcol = n / Bp.blockColSize();
+          gp.k = kb;
+          gp.B = &(Bp(k_ind, 0));
+          gp.beta = beta_;
+          gp.C = &C[m2 * ldc];
+          gp.ldc = ldc * sizeof(C[0]);
+          gp.b_block_cols = nbcol;
+#ifdef FBGEMM_ENABLE_KLEIDIAI
+          if constexpr (
+              std::is_same<T, float16>::value ||
+              std::is_same<T, float>::value) {
+            gp.lda = k * sizeof(A[0]);
+          } else {
+#endif
+            gp.b_block_size = gp.k * Bp.blockColSize() * sizeof(gp.B[0]);
+#ifdef FBGEMM_ENABLE_KLEIDIAI
+          }
+#endif
+          if ((n % Bp.blockColSize()) == 0) {
+            int64_t jb_begin = 0, jb_end = 0;
+            fbgemmPartition1D(
+                thread_id, num_threads, gp.b_block_cols, jb_begin, jb_end);
+            gp.B += gp.k * Bp.blockColSize() * jb_begin;
+            gp.C += Bp.blockColSize() * jb_begin;
+            gp.b_block_cols = jb_end - jb_begin;
+            if (gp.b_block_cols) {
+#ifdef FBGEMM_USE_REF_KERNEL
+              ref_kernel<T>(kernel_nrows, &gp, C, m, n, simd_width);
+#else
+              kernels[kernel_nrows](&gp);
+#endif
+            }
+          } else {
+            int last_blk_col = nbcol * Bp.blockColSize();
+            if (nbcol) {
+              int64_t jb_begin = 0, jb_end = 0;
+              fbgemmPartition1D(
+                  thread_id, num_threads, gp.b_block_cols, jb_begin, jb_end);
+              gp.B += gp.k * Bp.blockColSize() * jb_begin;
+              gp.C += Bp.blockColSize() * jb_begin;
+              gp.b_block_cols = jb_end - jb_begin;
+              if (gp.b_block_cols) {
+#ifdef FBGEMM_USE_REF_KERNEL
+                ref_kernel<T>(kernel_nrows, &gp, C, m, n, simd_width);
+#else
+                kernels[kernel_nrows](&gp);
+#endif
+              }
+            }
+            // use one thread to handle the fringe cases
+            if (thread_id == num_threads - 1) {
+              // leftover
+              const int rem [[maybe_unused]] = n - last_blk_col;
+              assert(rem < Bp.blockColSize());
+              // small temporary buffer: the size should be larger than the
+              // required kernel_nrow x kernel_ncols elements computed in the
+              // registers.
+              std::array<float, 14 * 32> c_tmp{0.f};
+              assert(
+                  static_cast<int64_t>(c_tmp.size()) >=
+                  kernel_nrows * Bp.blockColSize());
+              gp.B = &(Bp(k_ind, last_blk_col));
+              gp.C = c_tmp.data();
+              gp.ldc = Bp.blockColSize() * sizeof(C[0]);
+              gp.b_block_cols = 1;
+#ifdef FBGEMM_USE_REF_KERNEL
+              ref_kernel<T>(
+                  kernel_nrows, &gp, c_tmp.data(), 14, 32, simd_width);
+#else
+              kernels[kernel_nrows](&gp);
+#endif
+              // Copy only the valid leftover columns from the temporary
+              // buffer back into C.
+              for (int i = 0; i < kernel_nrows; i++) {
+                // Todo: use assembly
+                for (int j = last_blk_col; j < n; j++) {
+                  assert(
+                      i * Bp.blockColSize() + (j - last_blk_col) <
+                      static_cast<int64_t>(sizeof(c_tmp) / sizeof(c_tmp[0])));
+                  if (beta_ == 0.f) {
+                    C[(m2 + i) * ldc + j] =
+                        c_tmp[i * Bp.blockColSize() + (j - last_blk_col)];
+                  } else {
+                    C[(m2 + i) * ldc + j] = beta_ * C[(m2 + i) * ldc + j] +
+                        c_tmp[i * Bp.blockColSize() + (j - last_blk_col)];
+                  }
+                }
+              }
+            }
+          }
+        }
+        m1 += kernel_nrows * nkernel_nrows;
+      }
+    }
+  }
+}
+#endif
+#undef FBGEMM_USE_REF_KERNEL
+} // namespace fbgemm
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmI64.h ADDED Viewed

	@@ -0,0 +1,36 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include <cstdint>
+#include "fbgemm/Utils.h"
+namespace fbgemm {
+FBGEMM_API void cblas_gemm_i64_i64acc( // int64 GEMM with int64 accumulation; presumably follows the usual BLAS gemm conventions — TODO confirm
+    matrix_op_t transa,
+    matrix_op_t transb,
+    int M,
+    int N,
+    int K,
+    const std::int64_t* A,
+    int lda,
+    const std::int64_t* B,
+    int ldb,
+    bool accumulate, // presumably adds to the existing C instead of overwriting it — TODO confirm
+    std::int64_t* C,
+    int ldc);
+} // namespace fbgemm
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)

URSA/.venv_ursa/lib/python3.12/site-packages/torch/include/fbgemm/FbgemmI8DepthwiseAvx2.h ADDED Viewed

	@@ -0,0 +1,117 @@

+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include <cstdint>
+#include "fbgemm/ConvUtils.h"
+#include "fbgemm/FbgemmBuild.h"
+#include "fbgemm/UtilsAvx2.h"
+namespace fbgemm {
+class FBGEMM_API PackedDepthWiseConvMatrix {
+ public:
+  /**
+   * Instances are neither copyable nor movable (all four copy/move operations
+   * are deleted below).
+   *
+   * @note IC, the number of input channels, is not a constructor parameter;
+   *       it is the same as the number of groups because depth-wise
+   *       convolution has one input channel per group.
+   * @param OC the number of output channels
+   * @param kernel_prod the product of all kernel dims. For example,
+   *                    kernel_prod = 9 for 3x3 conv, and 27 for 3x3x3 conv.
+   * @param smat the source unpacked weight in GRS layout
+   */
+  PackedDepthWiseConvMatrix(int OC, int kernel_prod, const std::int8_t* smat);
+  PackedDepthWiseConvMatrix(const PackedDepthWiseConvMatrix&) = delete;
+  PackedDepthWiseConvMatrix(PackedDepthWiseConvMatrix&&) = delete;
+  PackedDepthWiseConvMatrix& operator=(const PackedDepthWiseConvMatrix&) =
+      delete;
+  PackedDepthWiseConvMatrix& operator=(PackedDepthWiseConvMatrix&&) = delete;
+  virtual ~PackedDepthWiseConvMatrix();
+  const std::int8_t* PackedMat() const {
+    return pmat_;
+  }
+  int GetKernelProduct() const {
+    return kernel_prod_;
+  }
+  /**
+   * @brief Unpacks pmat_ into unpacked_data.
+   * Used for recovering the weight matrix into the original format
+   */
+  void unpack(std::int8_t* unpacked_data);
+  /**
+   * @brief returns the index into pmat_ given the row and column for smat
+   */
+  int addr(int r, int c);
+ private:
+  const int OC_; /**< the number of output channels */
+  const int kernel_prod_; /**< the product of all kernel dims */
+  std::int8_t* pmat_; /**< packed weight */
+}; // PackedDepthWiseConvMatrix
+/**
+ * Depth-wise convolution that results in the same output feature size as the
+ * input feature. That is PAD_T = PAD_B = (R - 1) / 2 and PAD_L = PAD_R =
+ * (S - 1) / 2. This function also does requantization.
+ * @tparam BIAS_TYPE defaults to std::int32_t; float means the bias is
+ *                   unquantized (see act_times_w_scale)
+ * @param col_offsets nullptr if col_offsets are folded into bias
+ * @param act_times_w_scale Only used if BIAS_TYPE is float, i.e., bias is
+ *                          unquantized.
+ * @param fuse_relu defaults to false; presumably applies ReLU to the output
+ *                  as part of requantization — TODO confirm
+ * @param thread_id defaults to 0; presumably this caller's index among
+ *                  num_threads cooperating threads — TODO confirm
+ */
+template <QuantizationGranularity Q_GRAN, typename BIAS_TYPE = std::int32_t>
+FBGEMM_API void depthwise_2d_same_pad(
+    int N,
+    int H,
+    int W,
+    int IC,
+    int OC,
+    int stride_h,
+    int stride_w,
+    std::int32_t A_zero_point,
+    const std::uint8_t* A,
+    const std::int32_t* B_zero_point,
+    const PackedDepthWiseConvMatrix& Bp,
+    const float* C_multiplier,
+    std::int32_t C_zero_point,
+    std::uint8_t* C,
+    const std::int32_t* col_offsets,
+    const BIAS_TYPE* bias,
+    bool fuse_relu = false,
+    const float* act_times_w_scale = nullptr,
+    int thread_id = 0,
+    int num_threads = 1);
+/**
+ * 3D counterpart of depthwise_2d_same_pad: the convolution geometry is passed
+ * in conv_p instead of as individual integers; the remaining parameters carry
+ * the same names.
+ * @param col_offsets nullptr if col_offsets are folded into bias
+ */
+template <QuantizationGranularity Q_GRAN, typename BIAS_TYPE = std::int32_t>
+FBGEMM_API void depthwise_3d_same_pad(
+    const conv_param_t<3>& conv_p,
+    std::int32_t A_zero_point,
+    const std::uint8_t* A,
+    const std::int32_t* B_zero_point,
+    const PackedDepthWiseConvMatrix& Bp,
+    const float* C_multiplier,
+    std::int32_t C_zero_point,
+    std::uint8_t* C,
+    const std::int32_t* col_offsets,
+    const BIAS_TYPE* bias,
+    bool fuse_relu = false,
+    const float* act_times_w_scale = nullptr,
+    int thread_id = 0,
+    int num_threads = 1);
+} // namespace fbgemm
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)