diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/ATenGeneral.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/ATenGeneral.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b787a2163e87c903ce0bd034b424eb1773c644d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/ATenGeneral.h
@@ -0,0 +1,3 @@
+#pragma once
+
+#include <c10/macros/Macros.h>
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/ATen_fwd.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/ATen_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..a66523ab183923e1c15862268b1b0260ccfb9e14
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/ATen_fwd.h
@@ -0,0 +1,46 @@
+#pragma once
+#include <c10/core/QScheme.h>
+
+// Forward declarations of core ATen types used in dispatch functions
+namespace c10 {  // forward declarations only: every name below is an incomplete type in this header
+
+template<typename T>
+class List;
+template<typename T>
+class IListRef;
+class Stream;
+class Scalar;
+class SymInt;
+class SymIntList;
+struct Storage;
+struct TensorOptions;
+template <typename T>
+class ArrayRef;
+template <typename T>
+class OptionalArrayRef;
+
+}  // namespace c10
+
+namespace at {
+
+class Tensor;  // forward declarations of at:: types (no definitions in this header)
+class OptionalTensorRef;
+struct Dimname;
+struct Generator;
+using TensorList = c10::ArrayRef<Tensor>;  // aliases built from the c10 templates declared in this header
+using ITensorListRef = c10::IListRef<Tensor>;
+using IOptTensorListRef = c10::IListRef<OptionalTensorRef>;
+using DimnameList = c10::ArrayRef<Dimname>;
+using IntArrayRef = c10::ArrayRef<int64_t>;
+using OptionalIntArrayRef = c10::OptionalArrayRef<int64_t>;
+using OptionalSymIntArrayRef = c10::OptionalArrayRef<c10::SymInt>;
+
+using c10::Stream;  // using-declarations: make these c10 names visible inside at::
+using c10::Storage;
+using c10::QScheme;  // not forward-declared in this header; presumably provided by <c10/core/QScheme.h>
+using c10::Scalar;
+using c10::SymInt;
+using c10::SymIntList;
+using c10::TensorOptions;
+
+}  // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/Dict_inl.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/Dict_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..0419b3bd49e916ed30ee3ac7746739e9b99a0468
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/Dict_inl.h
@@ -0,0 +1,209 @@
+#pragma once
+
+#include <ATen/core/ivalue.h>
+#include <c10/util/hash.h>
+
+namespace c10 {
+namespace detail {
+inline bool DictKeyEqualTo::operator()(const IValue& lhs, const IValue& rhs) const {
+  // Two tensor keys match only when they are the very same object (identity),
+  // following how it's done in Python.
+  const bool both_tensors = lhs.isTensor() && rhs.isTensor();
+  // Every other pairing takes the container fast path: identity first for
+  // efficiency, then value equality (see: [container equality]).
+  return both_tensors ? lhs.is(rhs)
+                      : _fastEqualsForContainer(lhs, rhs);
+}
+}
+
+template<class T> decltype(auto) getTypePtr();  // declaration only; used below to obtain the types for Key and Value
+std::string toString(const Type& type);  // declaration only; used below to build the cast-failure messages
+
+namespace impl {
+
+template<class Key, class Value>
+Dict<Key, Value> toTypedDict(GenericDict dict) {  // re-wraps a GenericDict as Dict<Key, Value> after checking its runtime element types
+  TORCH_INTERNAL_ASSERT(*getTypePtr<Key>() == *dict.impl_->elementTypes.keyType, "Tried to cast a Dict<", toString(*dict.impl_->elementTypes.keyType), ", ", toString(*dict.impl_->elementTypes.valueType) ,"> to a Dict<", toString(*getTypePtr<Key>()), ", ", toString(*getTypePtr<Value>()), ">. Key types mismatch.");  // stored key type must compare equal to Key's type
+  TORCH_INTERNAL_ASSERT(*getTypePtr<Value>() == *dict.impl_->elementTypes.valueType, "Tried to cast a Dict<", toString(*dict.impl_->elementTypes.keyType), ", ", toString(*dict.impl_->elementTypes.valueType) ,"> to a Dict<", toString(*getTypePtr<Key>()), ", ", toString(*getTypePtr<Value>()), ">. Value types mismatch.");  // stored value type must compare equal to Value's type
+
+  return Dict<Key, Value>(std::move(dict.impl_));  // the DictImpl pointer is moved across; entries are not copied here
+}
+
+template<class Key, class Value>
+GenericDict toGenericDict(Dict<Key, Value> dict) {  // opposite direction of toTypedDict: drops the static Key/Value types
+  return GenericDict(std::move(dict.impl_));  // moves the DictImpl pointer across; no type check is performed
+}
+}
+
+namespace detail {
+
+inline size_t DictKeyHash::operator()(const IValue& ivalue) const {
+  // Guard-clause dispatch on the IValue tag, checked in a fixed order; every
+  // supported kind is forwarded to the hasher for its payload type.
+  if (ivalue.isInt())
+    return std::hash<int64_t>()(ivalue.toInt());
+  if (ivalue.isString())
+    return std::hash<c10::string_view>()(ivalue.toStringView());
+  if (ivalue.isDouble())
+    return std::hash<double>()(ivalue.toDouble());
+  if (ivalue.isComplexDouble())
+    return c10::hash<c10::complex<double>>()(ivalue.toComplexDouble());
+  if (ivalue.isBool())
+    return std::hash<bool>()(ivalue.toBool());
+  if (ivalue.isTensor())  // hashed by the TensorImpl pointer, not by contents
+    return std::hash<TensorImpl*>()(ivalue.toTensor().unsafeGetTensorImpl());
+  if (ivalue.isDevice())
+    return std::hash<Device>()(ivalue.toDevice());
+  // Any other tag is rejected with an exception naming the offending kind.
+  throw std::runtime_error("Can't hash IValues with tag '" + ivalue.tagKind() + "'");
+}
+
+inline intrusive_ptr<DictImpl> DictImpl::copy() const {  // builds a separate DictImpl from this one's `dict` and `elementTypes`
+  return make_intrusive<DictImpl>(dict, elementTypes);
+}
+
+}
+
+template<class Key, class Value>
+Dict<Key, Value>::Dict()  // empty dict whose element types are derived from Key and Value via getTypePtr
+  :Dict(make_intrusive<detail::DictImpl>(
+      detail::DictImpl::dict_map_type(),
+      detail::DictImpl::DictElementTypes{getTypePtr<Key>(), getTypePtr<Value>()})) {
+  static_assert(!std::is_same<Key, IValue>::value, "This constructor is not valid for Dict<IValue, _>. Please use c10::impl::GenericDict(keyType, valueType) instead.");  // compile-time guard: Key must be a concrete type
+  static_assert(!std::is_same<Value, IValue>::value, "This constructor is not valid for Dict<_, IValue>. Please use c10::impl::GenericDict(keyType, valueType) instead.");  // compile-time guard: Value must be a concrete type
+}
+
+template<class Key, class Value>
+Dict<Key, Value>::Dict(TypePtr keyType, TypePtr valueType)  // empty dict with caller-supplied runtime element types
+: Dict(make_intrusive<detail::DictImpl>(
+    detail::DictImpl::dict_map_type(),
+    detail::DictImpl::DictElementTypes {std::move(keyType), std::move(valueType)})) {
+  static_assert(std::is_same<Key, IValue>::value, "This constructor is only valid for c10::impl::GenericDict.");  // only instantiable when Key is IValue
+  static_assert(std::is_same<Value, IValue>::value, "This constructor is only valid for c10::impl::GenericDict.");  // only instantiable when Value is IValue
+}
+
+template<class Key, class Value>
+Dict<Key, Value>::Dict(c10::intrusive_ptr<detail::DictImpl>&& impl): impl_(std::move(impl)) {}  // adopts an existing DictImpl; the other constructors delegate here
+
+template<class Key, class Value>
+Dict<Key, Value> Dict<Key, Value>::copy() const {
+  return Dict<Key, Value>(impl_->copy());  // wraps the new DictImpl made by DictImpl::copy(), so the result has its own impl_
+}
+
+template<class Key, class Value>
+typename Dict<Key, Value>::iterator Dict<Key, Value>::begin() const {
+  return iterator{impl_->dict.begin()};  // wraps the underlying map's begin() in the Dict iterator type
+}
+
+template<class Key, class Value>
+typename Dict<Key, Value>::iterator Dict<Key, Value>::end() const {
+  return iterator{impl_->dict.end()};  // wraps the underlying map's end() in the Dict iterator type
+}
+
+template<class Key, class Value>
+bool Dict<Key, Value>::empty() const {
+  return impl_->dict.empty();  // forwards to the underlying map
+}
+
+template<class Key, class Value>
+typename Dict<Key, Value>::size_type Dict<Key, Value>::size() const {
+  return impl_->dict.size();  // forwards to the underlying map
+}
+
+template<class Key, class Value>
+void Dict<Key, Value>::clear() const {  // const applies to the handle only; the DictImpl behind impl_ is modified
+  impl_->dict.clear();
+}
+
+template<class Key, class Value>
+template<class Key_, class Value_>
+std::pair<typename Dict<Key, Value>::iterator, bool> Dict<Key, Value>::insert(Key_&& key, Value_&& value) const {
+  static_assert(std::is_constructible<Key, Key_>::value, "Wrong type for the key argument of Dict::insert");  // argument must be usable to construct a Key
+  static_assert(std::is_constructible<Value, Value_>::value, "Wrong type for the value argument of Dict::insert");  // argument must be usable to construct a Value
+  auto inserted = impl_->dict.emplace(  // builds Key/Value from the forwarded arguments, then hands them to the map's emplace
+      Key(std::forward<Key_>(key)),
+      Value(std::forward<Value_>(value)));
+  return {iterator{inserted.first}, inserted.second};  // re-wrap the map iterator; the bool is passed through from emplace
+}
+
+template<class Key, class Value>
+template<class Key_, class Value_>
+std::pair<typename Dict<Key, Value>::iterator, bool> Dict<Key, Value>::insert_or_assign(Key_&& key, Value_&& value) const {
+  static_assert(std::is_constructible<Key, Key_>::value, "Wrong type for the key argument of Dict::insert_or_assign");  // argument must be usable to construct a Key
+  static_assert(std::is_constructible<Value, Value_>::value, "Wrong type for the value argument of Dict::insert_or_assign");  // argument must be usable to construct a Value
+  auto inserted = impl_->dict.insert_or_assign(  // builds Key/Value from the forwarded arguments, then hands them to the map's insert_or_assign
+    Key(std::forward<Key_>(key)),
+    Value(std::forward<Value_>(value)));
+  return {iterator{inserted.first}, inserted.second};  // re-wrap the map iterator; the bool is passed through from the map
+}
+
+template<class Key, class Value>
+void Dict<Key, Value>::erase(iterator iter) const {
+  impl_->dict.erase(iter.entryRef_.iterator_);  // unwraps the Dict iterator to the underlying map iterator before erasing
+}
+
+template<class Key, class Value>
+C10_NODISCARD size_t Dict<Key, Value>::erase(const Key& key) const {  // result is marked C10_NODISCARD
+  return impl_->dict.erase(key);  // passes through whatever the underlying map's erase(key) returns
+}
+
+template<class Key, class Value>
+Value Dict<Key, Value>::at(const Key& key) const {  // returns the converted value by value, not a reference into the dict
+  return impl_->dict.at(key).template to<Value>();  // map lookup via at(), then conversion of the stored entry to Value; missing-key behaviour is that of the map's at()
+}
+
+template<class Key, class Value>
+typename Dict<Key, Value>::iterator Dict<Key, Value>::find(const Key& key) const {
+  return iterator{impl_->dict.find(key)};  // wraps the underlying map's find() result in the Dict iterator type
+}
+
+template<class Key, class Value>
+bool Dict<Key, Value>::contains(const Key& key) const {
+  return end() != find(key);  // present exactly when find(key) does not return end()
+}
+
+template<class Key, class Value>
+void Dict<Key, Value>::reserve(size_type count) const {  // const applies to the handle only; the DictImpl behind impl_ is modified
+  impl_->dict.reserve(count);
+}
+
+template<class Key, class Value>
+TypePtr Dict<Key, Value>::keyType() const {
+  return impl_->elementTypes.keyType;  // the runtime key type recorded in the DictImpl
+}
+
+template<class Key, class Value>
+TypePtr Dict<Key, Value>::valueType() const {
+  return impl_->elementTypes.valueType;  // the runtime value type recorded in the DictImpl
+}
+template <class Key, class Value>
+void Dict<Key, Value>::unsafeSetKeyType(TypePtr t) {
+  impl_->elementTypes.keyType = std::move(t);  // overwrites the recorded key type only; existing entries are not inspected here
+}
+
+template <class Key, class Value>
+void Dict<Key, Value>::unsafeSetValueType(TypePtr t) {
+  impl_->elementTypes.valueType = std::move(t);  // overwrites the recorded value type only; existing entries are not inspected here
+}
+
+template <class Key_, class Value_>
+bool operator==(const Dict<Key_, Value_>& lhs, const Dict<Key_, Value_>& rhs) {
+  // Fast path: both handles hold the same DictImpl pointer, so the dicts are
+  // equal without looking at any entry.
+  const bool same_impl = (lhs.impl_ == rhs.impl_);
+
+  // Slow path: compare the pointed-to DictImpl objects by value. The
+  // short-circuit skips that comparison whenever the identity check succeeds.
+  return same_impl || (*lhs.impl_ == *rhs.impl_);
+}
+
+template <class Key_, class Value_>
+bool operator!=(const Dict<Key_, Value_>& lhs, const Dict<Key_, Value_>& rhs) {
+  return !(lhs == rhs);  // defined as the negation of operator== so the two can never disagree
+}
+
+template <class Key, class Value>
+bool Dict<Key, Value>::is(const Dict& rhs) const {  // identity check, as opposed to the value comparison done by operator==
+  return this->impl_ == rhs.impl_;  // true only when both handles hold the same DictImpl pointer
+}
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/List.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/List.h
new file mode 100644
index 0000000000000000000000000000000000000000..34cdd738b95f10cde702264b83e3100ebd7c6a6d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/List.h
@@ -0,0 +1,488 @@
+#pragma once
+
+#include <ATen/core/ivalue_to.h>
+#include <ATen/core/jit_type_base.h>
+#include <c10/macros/Macros.h>
+#include <c10/macros/Export.h>
+#include <c10/util/TypeTraits.h>
+#include <c10/util/TypeList.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/ArrayRef.h>
+#include <optional>
+#include <vector>
+
+namespace at {
+class Tensor;
+}
+namespace c10 {
+struct IValue;
+template<class T> class List;
+struct Type;
+
+namespace detail {
+
+struct ListImpl final : public c10::intrusive_ptr_target {  // the storage object that List handles point to through intrusive_ptr
+  using list_type = std::vector<IValue>;  // elements are held as IValue regardless of the List's T
+
+  explicit TORCH_API ListImpl(list_type list_, TypePtr elementType_);  // declared here; no body in this header
+
+  list_type list;  // the elements
+
+  TypePtr elementType;  // runtime element type kept alongside the elements
+
+  intrusive_ptr<ListImpl> copy() const {  // builds a separate ListImpl from this one's `list` and `elementType`
+    return make_intrusive<ListImpl>(list, elementType);
+  }
+  friend TORCH_API bool operator==(const ListImpl& lhs, const ListImpl& rhs);  // declared here; no body in this header
+};
+}
+
+namespace impl {
+
+template<class T, class Iterator> class ListIterator;  // forward declaration; the class is defined further down in this header
+
+template<class T, class Iterator> class ListElementReference;  // forward declaration; the class is defined further down in this header
+
+template<class T, class Iterator>
+void swap(ListElementReference<T, Iterator>&& lhs, ListElementReference<T, Iterator>&& rhs) noexcept;  // takes both proxies as rvalues; declared before the class so it can be befriended
+
+template<class T, class Iterator>
+bool operator==(const ListElementReference<T, Iterator>& lhs, const T& rhs);  // proxy compared against a plain T
+
+template<class T, class Iterator>
+bool operator==(const T& lhs, const ListElementReference<T, Iterator>& rhs);  // same comparison with the operands swapped
+
+template<class T>
+struct ListElementConstReferenceTraits {  // selects the type returned by List<T>'s const element accessors
+  // In the general case, we use IValue::to().
+  using const_reference = typename c10::detail::ivalue_to_const_ref_overload_return<T>::type;
+};
+
+// There is no to() overload for std::optional<std::string>.
+template<>
+struct ListElementConstReferenceTraits<std::optional<std::string>> {  // full specialization for optional strings
+  using const_reference = std::optional<std::reference_wrapper<const std::string>>;  // an optional holding a reference wrapper to the string, instead of the general-case type
+};
+
+template<class T, class Iterator>
+class ListElementReference final {  // proxy object standing in for a reference to one list element; wraps an Iterator
+public:
+  operator std::conditional_t<  // read access: converts to const T& when the IValue const-ref overload yields a reference, otherwise to T by value
+      std::is_reference_v<typename c10::detail::
+                            ivalue_to_const_ref_overload_return<T>::type>,
+      const T&,
+      T>() const;
+
+  ListElementReference& operator=(T&& new_value) &&;  // &&-qualified: only callable on an rvalue proxy
+
+  ListElementReference& operator=(const T& new_value) &&;  // &&-qualified: only callable on an rvalue proxy
+
+  // assigning another ref to this assigns the underlying value
+  ListElementReference& operator=(ListElementReference&& rhs) && noexcept;
+
+  const IValue& get() const& {  // the element as stored (an IValue), without conversion to T
+    return *iterator_;
+  }
+
+  friend void swap<T, Iterator>(ListElementReference&& lhs, ListElementReference&& rhs) noexcept;
+
+  ListElementReference(const ListElementReference&) = delete;  // proxies cannot be copied
+  ListElementReference& operator=(const ListElementReference&) = delete;
+
+private:
+  ListElementReference(Iterator iter)  // private: only the friends declared below can create a proxy
+  : iterator_(iter) {}
+
+  // allow moving, but only our friends (i.e. the List class) can move us
+  ListElementReference(ListElementReference&&) noexcept = default;
+  ListElementReference& operator=(ListElementReference&& rhs) & noexcept {  // lvalue overload: rebinds the proxy to rhs's position instead of writing an element
+    iterator_ = std::move(rhs.iterator_);
+    return *this;
+  }
+
+  friend class List<T>;
+  friend class ListIterator<T, Iterator>;
+
+  Iterator iterator_;  // position of the referenced element
+};
+
+// this wraps vector::iterator to make sure user code can't rely
+// on it being the type of the underlying vector.
+template <class T, class Iterator>
+class ListIterator final {  // thin wrapper: every operation below forwards to the wrapped iterator_
+ public:
+   // C++17 friendly std::iterator implementation
+  using iterator_category = std::random_access_iterator_tag;
+  using value_type = T;
+  using difference_type = std::ptrdiff_t;
+  using pointer = T*;
+  using reference = ListElementReference<T, Iterator>;  // dereferencing yields a proxy object rather than T&
+
+  explicit ListIterator() = default;
+  ~ListIterator() = default;
+
+  ListIterator(const ListIterator&) = default;
+  ListIterator(ListIterator&&) noexcept = default;
+  ListIterator& operator=(const ListIterator&) = default;
+  ListIterator& operator=(ListIterator&&) noexcept = default;
+
+  ListIterator& operator++() {  // pre-increment
+      ++iterator_;
+      return *this;
+  }
+
+  ListIterator operator++(int) {  // post-increment: returns the position held before the step
+      ListIterator copy(*this);
+      ++*this;
+      return copy;
+  }
+
+  ListIterator& operator--() {  // pre-decrement
+      --iterator_;
+      return *this;
+  }
+
+  ListIterator operator--(int) {  // post-decrement: returns the position held before the step
+      ListIterator copy(*this);
+      --*this;
+      return copy;
+  }
+
+  ListIterator& operator+=(typename List<T>::size_type offset) {  // NOTE: offset is List<T>::size_type, not difference_type
+      iterator_ += offset;
+      return *this;
+  }
+
+  ListIterator& operator-=(typename List<T>::size_type offset) {  // NOTE: offset is List<T>::size_type, not difference_type
+      iterator_ -= offset;
+      return *this;
+  }
+
+  ListIterator operator+(typename List<T>::size_type offset) const {  // new iterator; *this is unchanged
+    return ListIterator{iterator_ + offset};
+  }
+
+  ListIterator operator-(typename List<T>::size_type offset) const {  // new iterator; *this is unchanged
+    return ListIterator{iterator_ - offset};
+  }
+
+  friend difference_type operator-(const ListIterator& lhs, const ListIterator& rhs) {  // distance between two positions
+    return lhs.iterator_ - rhs.iterator_;
+  }
+
+  ListElementReference<T, Iterator> operator*() const {  // proxy for the element at the current position
+    return {iterator_};
+  }
+
+  ListElementReference<T, Iterator> operator[](typename List<T>::size_type offset) const {  // proxy for the element `offset` positions ahead
+    return {iterator_ + offset};
+  }
+
+private:
+  explicit ListIterator(Iterator iterator): iterator_(std::move(iterator)) {}  // private: only the friends declared below can wrap a raw iterator
+
+  Iterator iterator_;  // the wrapped underlying iterator
+
+  friend bool operator==(const ListIterator& lhs, const ListIterator& rhs) {  // comparisons are friends defined in-class and compare the wrapped iterators
+    return lhs.iterator_ == rhs.iterator_;
+  }
+
+  friend bool operator!=(const ListIterator& lhs, const ListIterator& rhs) {
+    return !(lhs == rhs);
+  }
+
+  friend bool operator<(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ < rhs.iterator_;
+  }
+
+  friend bool operator<=(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ <= rhs.iterator_;
+  }
+
+  friend bool operator>(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ > rhs.iterator_;
+  }
+
+  friend bool operator>=(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ >= rhs.iterator_;
+  }
+
+  friend class ListIterator<T, typename c10::detail::ListImpl::list_type::iterator>;
+  friend class List<T>;
+};
+
+template<class T> List<T> toTypedList(List<IValue> list);  // declaration only: List<IValue> to List<T>
+template<class T> List<IValue> toList(List<T>&& list);  // declaration only: List<T> to List<IValue>, rvalue overload
+template<class T> List<IValue> toList(const List<T>& list);  // declaration only: List<T> to List<IValue>, const-lvalue overload
+const IValue* ptr_to_first_element(const List<IValue>& list);  // declaration only; befriended by List further down in this header
+}
+
+/**
+ * An object of this class stores a list of values of type T.
+ *
+ * This is a pointer type. After a copy, both Lists
+ * will share the same storage:
+ *
+ * > List<std::string> a;
+ * > List<std::string> b = a;
+ * > b.push_back("three");
+ * > ASSERT("three" == a.get(0));
+ *
+ * We use this class in the PyTorch kernel API instead of
+ * std::vector<T>, because that allows us to do optimizations
+ * and switch out the underlying list implementation without
+ * breaking backwards compatibility for the kernel API.
+ */
+template<class T>
+class List final {
+private:
+  // This is an intrusive_ptr because List is a pointer type.
+  // Invariant: This will never be a nullptr, there will always be a valid
+  // ListImpl.
+  c10::intrusive_ptr<c10::detail::ListImpl> impl_;
+
+  using internal_reference_type = impl::ListElementReference<T, typename c10::detail::ListImpl::list_type::iterator>;  // assignable proxy returned by the non-const operator[]
+  using internal_const_reference_type = typename impl::ListElementConstReferenceTraits<T>::const_reference;  // type returned by get() and the const operator[]
+
+public:
+  using value_type = T;
+  using size_type = typename c10::detail::ListImpl::list_type::size_type;
+  using iterator = impl::ListIterator<T, typename c10::detail::ListImpl::list_type::iterator>;
+  using const_iterator = impl::ListIterator<T, typename c10::detail::ListImpl::list_type::iterator>;  // NOTE: exactly the same type as `iterator`
+  using reverse_iterator = impl::ListIterator<T, typename c10::detail::ListImpl::list_type::reverse_iterator>;
+
+  /**
+   * Constructs an empty list.
+   */
+  explicit List();
+
+  /**
+   * Constructs a list with some initial values.
+   * Example:
+   *   List<int> a({2, 3, 4});
+   */
+  List(std::initializer_list<T> initial_values);
+  explicit List(ArrayRef<T> initial_values);  // same as above, but explicit and taking an ArrayRef
+
+  /**
+   * Create a generic list with runtime type information.
+   * This only works for c10::impl::GenericList and is not part of the public API
+   * but only supposed to be used internally by PyTorch.
+   */
+  explicit List(TypePtr elementType);
+
+  List(const List&) = default;  // defaulted: copies the impl_ pointer, so both Lists share storage (see the class comment)
+  List& operator=(const List&) = default;
+
+  /**
+   * Create a new List pointing to a deep copy of the same data.
+   * The List returned is a new list with separate storage.
+   * Changes in it are not reflected in the original list or vice versa.
+   */
+  List copy() const;
+
+  /**
+   * Returns the element at specified location pos, with bounds checking.
+   * If pos is not within the range of the container, an exception of type std::out_of_range is thrown.
+   */
+  internal_const_reference_type get(size_type pos) const;
+
+  /**
+   * Moves out the element at the specified location pos and returns it, with bounds checking.
+   * If pos is not within the range of the container, an exception of type std::out_of_range is thrown.
+   * The list contains an invalid element at position pos afterwards. Any operations
+   * on it before re-setting it are invalid.
+   */
+  value_type extract(size_type pos) const;
+
+  /**
+   * Returns a reference to the element at specified location pos, with bounds checking.
+   * If pos is not within the range of the container, an exception of type std::out_of_range is thrown.
+   *
+   * You cannot store the reference, but you can read it and assign new values to it:
+   *
+   *   List<int64_t> list = ...;
+   *   list[2] = 5;
+   *   int64_t v = list[1];
+   */
+  internal_const_reference_type operator[](size_type pos) const;  // const overload: read-only result type
+
+  internal_reference_type operator[](size_type pos);  // non-const overload: returns the assignable proxy
+
+  /**
+   * Assigns a new value to the element at location pos.
+   */
+  void set(size_type pos, const value_type& value) const;
+
+  /**
+   * Assigns a new value to the element at location pos.
+   */
+  void set(size_type pos, value_type&& value) const;
+
+  /**
+   * Returns an iterator to the first element of the container.
+   * If the container is empty, the returned iterator will be equal to end().
+   */
+  iterator begin() const;
+
+  /**
+   * Returns an iterator to the element following the last element of the container.
+   * This element acts as a placeholder; attempting to access it results in undefined behavior.
+   */
+  iterator end() const;
+
+  /**
+   * Checks if the container has no elements.
+   */
+  bool empty() const;
+
+  /**
+   * Returns the number of elements in the container
+   */
+  size_type size() const;
+
+  /**
+   * Increase the capacity of the vector to a value that's greater or equal to new_cap.
+   */
+  void reserve(size_type new_cap) const;
+
+  /**
+   * Erases all elements from the container. After this call, size() returns zero.
+   * Invalidates any references, pointers, or iterators referring to contained elements. Any past-the-end iterators are also invalidated.
+   */
+  void clear() const;
+
+  /**
+   * Inserts value before pos.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator insert(iterator pos, const T& value) const;
+
+  /**
+   * Inserts value before pos.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator insert(iterator pos, T&& value) const;
+
+  /**
+   * Inserts a new element into the container directly before pos.
+   * The new element is constructed with the given arguments.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  template<class... Args>
+  iterator emplace(iterator pos, Args&&... value) const;
+
+  /**
+   * Appends the given element value to the end of the container.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void push_back(const T& value) const;
+
+  /**
+   * Appends the given element value to the end of the container.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void push_back(T&& value) const;
+
+  /**
+   * Appends the given list to the end of the container. Uses at most one memory allocation.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void append(List<T> lst) const;
+
+  /**
+   * Appends the given element value to the end of the container.
+   * The new element is constructed with the given arguments.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  template<class... Args>
+  void emplace_back(Args&&... args) const;
+
+  /**
+   * Removes the element at pos.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator erase(iterator pos) const;
+
+  /**
+   * Removes the elements in the range [first, last).
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator erase(iterator first, iterator last) const;
+
+  /**
+   * Removes the last element of the container.
+   * Calling pop_back on an empty container is undefined.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void pop_back() const;
+
+  /**
+   * Resizes the container to contain count elements.
+   * If the current size is less than count, additional default-inserted elements are appended.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void resize(size_type count) const;
+
+  /**
+   * Resizes the container to contain count elements.
+   * If the current size is less than count, additional copies of value are appended.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void resize(size_type count, const T& value) const;
+
+  /**
+   * Value equality comparison. This function implements Python-like semantics for
+   * equality: two lists with the same identity (e.g. same pointer) trivially
+   * compare equal, otherwise each element is compared for equality.
+   */
+  template <class T_>
+  friend bool operator==(const List<T_>& lhs, const List<T_>& rhs);
+
+  template <class T_>
+  friend bool operator!=(const List<T_>& lhs, const List<T_>& rhs);
+
+  /**
+   * Identity comparison. Returns true if and only if `rhs` represents the same
+   * List object as `this`.
+   */
+  bool is(const List<T>& rhs) const;
+
+  std::vector<T> vec() const;  // returns the elements as a std::vector<T>; declared here, body not in this header
+
+  /**
+   * Returns the number of Lists currently pointing to this same list.
+   * If this is the only instance pointing to this list, returns 1.
+   */
+  // TODO Test use_count
+  size_t use_count() const;
+
+  TypePtr elementType() const;  // runtime element type; declared here, body not in this header
+
+  // See [unsafe set type] for why this exists.
+  void unsafeSetElementType(TypePtr t);
+
+private:
+  explicit List(c10::intrusive_ptr<c10::detail::ListImpl>&& elements);  // private: wraps an existing ListImpl, taking the pointer by rvalue
+  explicit List(const c10::intrusive_ptr<c10::detail::ListImpl>& elements);  // private: wraps an existing ListImpl, copying the pointer
+  friend struct IValue;
+  template<class T_> friend List<T_> impl::toTypedList(List<IValue>);
+  template<class T_> friend List<IValue> impl::toList(List<T_>&&);
+  template<class T_> friend List<IValue> impl::toList(const List<T_>&);
+  friend const IValue* impl::ptr_to_first_element(const List<IValue>& list);
+};
+
+namespace impl {
+// GenericList is how IValue stores lists. It is, however, not part of the
+// public API. Kernels should use Lists with concrete types instead
+// (maybe except for some internal prim ops).
+using GenericList = List<IValue>;  // i.e. a List whose static element type is IValue itself
+
+}
+}
+
+namespace torch {
+  template<class T> using List = c10::List<T>;  // alias template: torch::List<T> names the same type as c10::List<T>
+}
+
+#include <ATen/core/List_inl.h>  // IWYU pragma: keep
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/List_inl.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/List_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d223122599c4f25a976c7099be2f434b6882846
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/List_inl.h
@@ -0,0 +1,353 @@
+#pragma once
+
+#include <ATen/core/jit_type_base.h>
+#include <ATen/core/ivalue.h>
+
+namespace c10 {
+
+// Declarations of helpers that the definitions below rely on: getTypePtr<T>()
+// supplies the element type for typed lists, and toString(Type) is used when
+// building the type-mismatch error message.
+template<class T> decltype(auto) getTypePtr();
+std::string toString(const Type& type);
+
+// Constructs a List that takes over the given ListImpl handle by moving it
+// into impl_.
+template<class T>
+List<T>::List(c10::intrusive_ptr<c10::detail::ListImpl>&& elements)
+: impl_(std::move(elements)) {}
+
+// Constructs a List whose impl_ is a copy of the given ListImpl handle.
+template<class T>
+List<T>::List(const c10::intrusive_ptr<c10::detail::ListImpl>& elements)
+: impl_(elements) {}
+
+// Default constructor: creates an empty list whose element type is the one
+// produced by getTypePtr<T>(). Rejected at compile time for List<IValue>.
+template<class T>
+List<T>::List()
+: List(make_intrusive<c10::detail::ListImpl>(
+    typename c10::detail::ListImpl::list_type(),
+    getTypePtr<T>())) {
+  static_assert(
+      !std::is_same_v<T, IValue>,
+      "This constructor is not valid for List<IValue>. Please use c10::impl::GenericList(elementType) instead.");
+}
+
+// Constructs a list holding a copy of every element of `values`, in order.
+// The element type is the one produced by getTypePtr<T>(). Rejected at
+// compile time for List<IValue>.
+template<class T>
+List<T>::List(ArrayRef<T> values)
+: List(make_intrusive<c10::detail::ListImpl>(
+    typename c10::detail::ListImpl::list_type(),
+    getTypePtr<T>())) {
+  static_assert(
+      !std::is_same_v<T, IValue>,
+      "This constructor is not valid for List<IValue>. Please use c10::impl::GenericList(elementType).");
+  auto& storage = impl_->list;
+  // Reserve up front so the appends below do not reallocate.
+  storage.reserve(values.size());
+  for (auto it = values.begin(); it != values.end(); ++it) {
+    const T& element = *it;
+    storage.push_back(element);
+  }
+}
+
+// Constructs a list from a braced initializer by delegating to the
+// ArrayRef<T> constructor. Rejected at compile time for List<IValue>.
+template<class T>
+List<T>::List(std::initializer_list<T> initial_values)
+: List(ArrayRef<T>(initial_values)) {
+  static_assert(
+      !std::is_same_v<T, IValue>,
+      "This constructor is not valid for List<IValue>. Please use c10::impl::GenericList(elementType).");
+}
+
+// Constructs an empty list with an explicitly supplied element type. Only
+// accepted at compile time when T is IValue or intrusive_ptr<ivalue::Future>.
+template<class T>
+List<T>::List(TypePtr elementType)
+: List(make_intrusive<c10::detail::ListImpl>(
+    typename c10::detail::ListImpl::list_type(),
+    std::move(elementType))) {
+  constexpr bool is_generic_list = std::is_same_v<T, IValue>;
+  constexpr bool is_future_list =
+      std::is_same_v<T, c10::intrusive_ptr<ivalue::Future>>;
+  static_assert(is_generic_list || is_future_list,
+                "This constructor is only valid for c10::impl::GenericList or List<Future>.");
+}
+
+namespace impl {
+// Converts a GenericList into a List<T> that adopts the same underlying
+// ListImpl (the handle is moved, elements are not copied). Fails the
+// TORCH_CHECK below when the stored element type is not acceptable for T.
+template<class T>
+List<T> toTypedList(impl::GenericList list) {
+  // If there's other instances of the list (i.e. list.use_count() > 1), then we have to be invariant
+  // because upcasting would allow people to add types into the new list that would break the old list.
+  // However, if there aren't any other instances of this list (i.e. list.use_count() == 1), then we can
+  // allow upcasting. This can be a perf improvement since we can cast List<T> to List<optional<T>>
+  // without having to copy it. This is also used to provide backwards compatibility with some old models
+  // that serialized the index arguments to aten::index, aten::index_put, aten::index_put_ and aten::index_put_impl_
+  // as List<Tensor> before we changed that argument to be List<optional<Tensor>>. When deserializing, we
+  // have list.use_count() == 1 and can deserialize the List<Tensor> directly as List<optional<Tensor>>.
+  TORCH_CHECK(*list.impl_->elementType == *getTypePtr<T>()
+    || (list.use_count() == 1 && list.impl_->elementType->isSubtypeOf(*getTypePtr<T>()))
+    , "Tried to cast a List<", toString(*list.impl_->elementType), "> to a List<", toString(*getTypePtr<T>()), ">. Types mismatch.");
+  return List<T>(std::move(list.impl_));
+}
+
+// Converts a typed list into a GenericList by moving its ListImpl handle out
+// of `list`; no type check is performed.
+template<class T>
+impl::GenericList toList(List<T>&& list) {
+  return GenericList(std::move(list.impl_));
+}
+// Converts a typed list into a GenericList that holds a copy of the same
+// ListImpl handle as `list`; no type check is performed.
+template<class T>
+impl::GenericList toList(const List<T>& list) {
+  return GenericList(list.impl_);
+}
+} // namespace impl
+
+// Returns a new List wrapping whatever ListImpl::copy() produces for the
+// current implementation object.
+template<class T>
+List<T> List<T>::copy() const {
+  auto cloned_impl = impl_->copy();
+  return List<T>(std::move(cloned_impl));
+}
+
+namespace detail {
+  // list_element_to<T>: turns a stored element into a T.
+  // Overload for a value that already is a T: returned unchanged.
+  template<class T>
+  T list_element_to(T element) {
+    return element;
+  }
+  // Overload for an IValue lvalue: converts via IValue::to<T>().
+  template<class T>
+  T list_element_to(const IValue& element) {
+    return element.template to<T>();
+  }
+  // Overload for an IValue rvalue: converts via the rvalue-qualified call
+  // to<T>() on the moved IValue.
+  template<class T>
+  T list_element_to(IValue&& element) {
+    return std::move(element).template to<T>();
+  }
+  // ListElementFrom<T>::from: turns a T into the IValue that is stored in
+  // the list. The primary template returns a new IValue built from the
+  // element (copied or moved, depending on the overload).
+  template<class T>
+  struct ListElementFrom {
+    static IValue from(const T& element) {
+      return element;
+    }
+    static IValue from(T&& element) {
+      return std::move(element);
+    }
+  };
+  // Specialization for IValue: the element already has the storage type, so
+  // the reference is passed through without creating a new IValue.
+  template<>
+  struct ListElementFrom<IValue> {
+    static const IValue& from(const IValue& element) {
+      return element;
+    }
+    static IValue&& from(IValue&& element) {
+      return std::move(element);
+    }
+  };
+} // namespace detail
+
+namespace impl {
+
+// Conversion from the proxy to the element it refers to: calls to<T>() on the
+// IValue the wrapped iterator points at. The result type is `const T&` when
+// ivalue_to_const_ref_overload_return<T>::type is a reference, otherwise `T`.
+template <class T, class Iterator>
+ListElementReference<T, Iterator>::operator std::conditional_t<
+    std::is_reference_v<typename c10::detail::ivalue_to_const_ref_overload_return<
+        T>::type>,
+    const T&,
+    T>() const {
+  return iterator_->template to<T>();
+}
+
+// Assigns a moved-in value to the referenced list slot. Only callable on an
+// rvalue proxy (note the && qualifier).
+template<class T, class Iterator>
+ListElementReference<T, Iterator>& ListElementReference<T, Iterator>::operator=(T&& new_value) && {
+  *iterator_ = c10::detail::ListElementFrom<T>::from(std::move(new_value));
+  return *this;
+}
+
+// Assigns a copy of `new_value` to the referenced list slot. Only callable on
+// an rvalue proxy.
+template<class T, class Iterator>
+ListElementReference<T, Iterator>& ListElementReference<T, Iterator>::operator=(const T& new_value) && {
+  *iterator_ = c10::detail::ListElementFrom<T>::from(new_value);
+  return *this;
+}
+
+// Proxy-to-proxy assignment: copies the IValue that `rhs` refers to into the
+// slot this proxy refers to; the proxies themselves are not rebound.
+template<class T, class Iterator>
+ListElementReference<T, Iterator>& ListElementReference<T, Iterator>::operator=(ListElementReference<T, Iterator>&& rhs) && noexcept {
+  *iterator_ = *rhs.iterator_;
+  return *this;
+}
+
+// Swaps the IValues that the two proxies refer to.
+template<class T, class Iterator>
+void swap(ListElementReference<T, Iterator>&& lhs, ListElementReference<T, Iterator>&& rhs)  noexcept {
+  std::swap(*lhs.iterator_, *rhs.iterator_);
+}
+
+// Compares the referenced element with a plain T by first converting the
+// proxy through its conversion operator.
+template<class T, class Iterator>
+bool operator==(const ListElementReference<T, Iterator>& lhs, const T& rhs) {
+  const T& lhs_tmp = lhs;
+  return lhs_tmp == rhs;
+}
+
+// Same comparison with the operands swapped; forwards to the overload above.
+template<class T, class Iterator>
+inline bool operator==(const T& lhs, const ListElementReference<T, Iterator>& rhs) {
+  return rhs == lhs;
+}
+
+// Produces the const-reference view of a stored IValue for element type T.
+// The generic version goes through IValue::to<T>().
+template<class T>
+inline typename ListElementConstReferenceTraits<T>::const_reference
+list_element_to_const_ref(const IValue& element) {
+  return element.template to<T>();
+}
+
+// Specialization for std::optional<std::string>, which uses
+// IValue::toOptionalStringRef() instead of to<T>().
+template<>
+inline typename ListElementConstReferenceTraits<std::optional<std::string>>::const_reference
+list_element_to_const_ref<std::optional<std::string>>(const IValue& element) {
+  return element.toOptionalStringRef();
+}
+
+} // namespace impl
+
+// Overwrites the element at index `pos` with a copy of `value`. The slot is
+// accessed through vector::at(), so an out-of-range `pos` throws.
+template<class T>
+void List<T>::set(size_type pos, const value_type& value) const {
+  impl_->list.at(pos) = c10::detail::ListElementFrom<T>::from(value);
+}
+
+// Overwrites the element at index `pos`, moving from `value`. The slot is
+// accessed through vector::at(), so an out-of-range `pos` throws.
+template<class T>
+void List<T>::set(size_type pos, value_type&& value) const {
+  impl_->list.at(pos) = c10::detail::ListElementFrom<T>::from(std::move(value));
+}
+
+// Named equivalent of the const subscript operator: returns the element at
+// index `pos`.
+template<class T>
+typename List<T>::internal_const_reference_type List<T>::get(size_type pos) const {
+  return (*this)[pos];
+}
+
+// Read-only element access. The slot is fetched with vector::at(), so an
+// out-of-range `pos` throws; the stored IValue is then converted with
+// list_element_to_const_ref<T>.
+template<class T>
+typename List<T>::internal_const_reference_type List<T>::operator[](size_type pos) const {
+  const auto& stored = impl_->list.at(pos);
+  return c10::impl::list_element_to_const_ref<T>(stored);
+}
+
+// Mutable element access: returns a proxy built from an iterator to the slot
+// at index `pos`.
+template<class T>
+typename List<T>::internal_reference_type List<T>::operator[](size_type pos) {
+  using offset_type = typename decltype(impl_->list)::difference_type;
+  auto& storage = impl_->list;
+  // at() is called only for its bounds check: it throws if pos is out of range.
+  static_cast<void>(storage.at(pos));
+  return {storage.begin() + static_cast<offset_type>(pos)};
+}
+
+// Moves the element at index `pos` out of the list and returns it. The slot
+// is accessed through vector::at(), so an out-of-range `pos` throws.
+template<class T>
+typename List<T>::value_type List<T>::extract(size_type pos) const {
+  auto& slot = impl_->list.at(pos);
+  auto extracted = c10::detail::list_element_to<T>(std::move(slot));
+  // Refill the moved-from slot with a value-initialized T (rather than None)
+  // so the list stays correctly typed.
+  slot = c10::detail::ListElementFrom<T>::from(T{});
+  return extracted;
+}
+
+// Returns an iterator wrapping the underlying vector's begin().
+template<class T>
+typename List<T>::iterator List<T>::begin() const {
+  auto& storage = impl_->list;
+  return iterator(storage.begin());
+}
+
+// Returns an iterator wrapping the underlying vector's end().
+template<class T>
+typename List<T>::iterator List<T>::end() const {
+  auto& storage = impl_->list;
+  return iterator(storage.end());
+}
+
+// Returns true when the underlying vector holds no elements.
+template<class T>
+bool List<T>::empty() const {
+  const auto& storage = impl_->list;
+  return storage.empty();
+}
+
+// Returns the number of elements in the underlying vector.
+template<class T>
+typename List<T>::size_type List<T>::size() const {
+  const auto& storage = impl_->list;
+  return storage.size();
+}
+
+// Forwards to the underlying vector's reserve(new_cap).
+template<class T>
+void List<T>::reserve(size_type new_cap) const {
+  auto& storage = impl_->list;
+  storage.reserve(new_cap);
+}
+
+// Removes every element from the underlying vector.
+template<class T>
+void List<T>::clear() const {
+  auto& storage = impl_->list;
+  storage.clear();
+}
+
+// Inserts a copy of `value` before `pos` and returns an iterator wrapping
+// the position reported by the underlying vector's insert().
+template<class T>
+typename List<T>::iterator List<T>::insert(iterator pos, const T& value) const {
+  auto& storage = impl_->list;
+  return iterator { storage.insert(pos.iterator_, c10::detail::ListElementFrom<T>::from(value)) };
+}
+
+// Same as above, but moves from `value`.
+template<class T>
+typename List<T>::iterator List<T>::insert(iterator pos, T&& value) const {
+  auto& storage = impl_->list;
+  return iterator { storage.insert(pos.iterator_, c10::detail::ListElementFrom<T>::from(std::move(value))) };
+}
+
+// Forwards the arguments to the underlying vector's emplace() before `pos`
+// and returns an iterator wrapping the resulting position.
+template<class T>
+template<class... Args>
+typename List<T>::iterator List<T>::emplace(iterator pos, Args&&... value) const {
+  // TODO Use list_element_from?
+  auto& storage = impl_->list;
+  return iterator { storage.emplace(pos.iterator_, std::forward<Args>(value)...) };
+}
+
+// Appends a copy of `value` to the end of the list.
+template<class T>
+void List<T>::push_back(const T& value) const {
+  auto& storage = impl_->list;
+  storage.push_back(c10::detail::ListElementFrom<T>::from(value));
+}
+
+// Appends `value` to the end of the list, moving from it.
+template<class T>
+void List<T>::push_back(T&& value) const {
+  auto& storage = impl_->list;
+  storage.push_back(c10::detail::ListElementFrom<T>::from(std::move(value)));
+}
+
+// Appends all elements of `b` to the end of this list.
+//
+// `b` is taken by value, so it is a separate handle to the caller's list.
+// - If `b` refers to the same underlying list as `*this` (e.g.
+//   `x.append(x)`), the elements are first copied into a temporary, because
+//   inserting a vector's own iterator range into itself is undefined
+//   behaviour.
+// - If `b` is the only handle to its list (use_count() == 1), the elements
+//   are moved out of it instead of copied.
+// - Otherwise the elements are copied.
+template<class T>
+void List<T>::append(List<T> b) const {
+  auto& dst = impl_->list;
+  if (b.impl_ == impl_) {
+    // Self-append: snapshot the current contents so that the source range
+    // does not alias the vector being grown.
+    const auto snapshot = dst;
+    dst.insert(dst.end(), snapshot.begin(), snapshot.end());
+    return;
+  }
+  auto& src = b.impl_->list;
+  if (b.use_count() == 1) {
+    // Qualified call: do not depend on ADL to find make_move_iterator.
+    dst.insert(dst.end(), std::make_move_iterator(src.begin()), std::make_move_iterator(src.end()));
+  } else {
+    dst.insert(dst.end(), src.begin(), src.end());
+  }
+}
+
+// Constructs a T from the forwarded arguments and appends it to the list.
+template<class T>
+template<class... Args>
+void List<T>::emplace_back(Args&&... args) const {
+  // TODO Use list_element_from?
+  auto& storage = impl_->list;
+  storage.push_back(T(std::forward<Args>(args)...));
+}
+
+// Erases the element at `pos` and returns an iterator wrapping the position
+// reported by the underlying vector's erase().
+template<class T>
+typename List<T>::iterator List<T>::erase(iterator pos) const {
+  auto& storage = impl_->list;
+  return iterator { storage.erase(pos.iterator_) };
+}
+
+// Erases the elements in [first, last) and returns an iterator wrapping the
+// position reported by the underlying vector's erase().
+template<class T>
+typename List<T>::iterator List<T>::erase(iterator first, iterator last) const {
+  auto& storage = impl_->list;
+  return iterator { storage.erase(first.iterator_, last.iterator_) };
+}
+
+// Removes the last element by forwarding to the underlying vector's
+// pop_back().
+template<class T>
+void List<T>::pop_back() const {
+  auto& storage = impl_->list;
+  storage.pop_back();
+}
+
+// Resizes the list to `count` elements; any newly added slots are filled
+// with a value-initialized T.
+template<class T>
+void List<T>::resize(size_type count) const {
+  auto& storage = impl_->list;
+  storage.resize(count, T{});
+}
+
+// Resizes the list to `count` elements; any newly added slots are filled
+// with `value`.
+template<class T>
+void List<T>::resize(size_type count, const T& value) const {
+  auto& storage = impl_->list;
+  storage.resize(count, value);
+}
+
+// Value equality for lists. Two handles to the same implementation object
+// are equal without looking at the elements; otherwise the implementation
+// objects themselves are compared.
+template<class T>
+bool operator==(const List<T>& lhs, const List<T>& rhs) {
+  return (lhs.impl_ == rhs.impl_) || (*lhs.impl_ == *rhs.impl_);
+}
+
+// Negation of operator== for lists.
+template<class T>
+bool operator!=(const List<T>& lhs, const List<T>& rhs) {
+  const bool equal = (lhs == rhs);
+  return !equal;
+}
+
+// Identity comparison: true when both handles compare equal as pointers to
+// the implementation object.
+template<class T>
+bool List<T>::is(const List<T>& rhs) const {
+  return impl_ == rhs.impl_;
+}
+
+// Copies the elements in [begin(), end()) into a new std::vector<T>.
+template<class T>
+std::vector<T> List<T>::vec() const {
+  return std::vector<T>(begin(), end());
+}
+
+// Returns the use count reported by the handle to the implementation object.
+template<class T>
+size_t List<T>::use_count() const {
+  const size_t owners = impl_.use_count();
+  return owners;
+}
+
+// Returns a copy of the element type stored in the implementation object.
+template <class T>
+TypePtr List<T>::elementType() const {
+  const auto& stored_type = impl_->elementType;
+  return stored_type;
+}
+
+// Replaces the element type stored in the implementation object with `t`.
+// No check against the elements already in the list is performed here.
+template <class T>
+void List<T>::unsafeSetElementType(TypePtr t) {
+  auto& stored_type = impl_->elementType;
+  stored_type = std::move(t);
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/UndefinedTensorImpl.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/UndefinedTensorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..885f6e195f05d37ab4253315242167f8e546dcc1
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/UndefinedTensorImpl.h
@@ -0,0 +1 @@
+#include <c10/core/UndefinedTensorImpl.h>
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/Variadic.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/Variadic.h
new file mode 100644
index 0000000000000000000000000000000000000000..da4df1b1b1a6628f76852d1012c7451fbdd85c3e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/Variadic.h
@@ -0,0 +1,92 @@
+#pragma once
+
+#include <utility>
+
+#include <c10/util/ArrayRef.h>
+#include <ATen/core/List.h>
+
+namespace at {
+
+// This class allows you to write variadic functions which
+// call a (possibly overloaded) function on each argument,
+// in order.  This is most commonly used in autogenerated code,
+// where it is convenient to have a function that can uniformly
+// take arguments of different types.  If your arguments
+// are homogeneous, consider using a std::initializer_list instead.
+//
+// For examples of this in use, see torch/csrc/utils/variadic.h
+// CRTP base: F is the derived visitor type. apply(...) invokes F's call
+// operator on every argument from left to right, stopping early as soon as
+// F::short_circuit() returns true.
+template <typename F>
+struct IterArgs {
+  // End of the recursion: no arguments left, hand back the derived object.
+  template <typename... Args>
+  inline F& apply() {
+    return self();
+  }
+
+  // Visits `arg`, then recurses on the remaining arguments unless the
+  // derived object asks to stop.
+  // NB: Arguments are perfectly forwarded; otherwise every argument would be
+  // copied by value.
+  template <typename T, typename... Args>
+  inline F& apply(T&& arg, Args&&... args) {
+    F& derived = self();
+    derived(std::forward<T>(arg));
+    if (derived.short_circuit()) {
+      return derived;
+    }
+    return apply(std::forward<Args>(args)...);
+  }
+
+  // The overloads below give sensible default handling for container-like
+  // arguments by visiting their elements one at a time. They are opt-in;
+  // enable them in your struct with:
+  //
+  //    using IterArgs<YourStructName>::operator()
+  //
+  // They are not on by default because you may be able to process such
+  // structures more efficiently than element by element.
+
+  template <typename T>
+  void operator()(c10::IListRef<T> args) {
+    F& derived = self();
+    for (const auto& arg : args) {
+      derived(arg);
+      if (derived.short_circuit()) {
+        return;
+      }
+    }
+  }
+
+  template <typename T>
+  void operator()(at::ArrayRef<T> args) {
+    F& derived = self();
+    for (const auto& arg : args) {
+      derived(arg);
+      if (derived.short_circuit()) {
+        return;
+      }
+    }
+  }
+
+  template <typename T>
+  void operator()(const torch::List<T>& args) {
+    F& derived = self();
+    for (const auto& arg : args) {
+      derived(arg);
+      if (derived.short_circuit()) {
+        return;
+      }
+    }
+  }
+
+  // NB: std::vector needs its own overload because template argument
+  // deduction will not apply an implicit conversion to ArrayRef.
+  template <typename T>
+  void operator()(const std::vector<T>& args) {
+    self()(at::ArrayRef<T>{args});
+  }
+
+  // Default policy: never stop early.
+  constexpr bool short_circuit() const {
+    return false;
+  }
+
+ private:
+  // Downcast to the derived visitor type.
+  inline F& self() {
+    return *static_cast<F*>(this);
+  }
+};
+
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/Vitals.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/Vitals.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a7a51e81e1d273f0f3e3d581db0e7022a57b330
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/Vitals.h
@@ -0,0 +1,91 @@
+#pragma once
+#include <ostream>
+#include <sstream>
+#include <unordered_map>
+
+#include <c10/core/impl/LocalDispatchKeySet.h>
+
+namespace at::vitals {
+
+// Reports whether vitals are enabled. TorchVitalAttr consults this before
+// recording a value. Only declared in this header.
+TORCH_API bool torchVitalEnabled();
+
+// A single vital attribute: a string value built up from streamed data.
+struct TORCH_API TorchVitalAttr {
+  // always initialized to empty
+  std::string value = "";
+
+  // Appends the streamed representation of `t` to `value`. Does nothing when
+  // vitals are disabled. Returns *this so calls can be chained.
+  template <typename T>
+  TorchVitalAttr& operator<<(const T& t) {
+    if (!torchVitalEnabled()) {
+      return *this;
+    }
+    std::stringstream formatted;
+    formatted << t;
+    value += formatted.str();
+    return *this;
+  }
+
+  // Replaces `value` with the streamed representation of `t`. Takes effect
+  // when `force` is true or vitals are enabled; otherwise does nothing.
+  template <typename T>
+  void write(const T& t, bool force) {
+    if (!force && !torchVitalEnabled()) {
+      return;
+    }
+    std::stringstream formatted;
+    formatted << t;
+    value = formatted.str();
+  }
+};
+
+// A named vital that owns a collection of TorchVitalAttr entries.
+struct TORCH_API TorchVital {
+  // Name supplied at construction.
+  std::string name;
+  // Attributes of this vital (presumably keyed by attribute name; confirm
+  // against the out-of-line implementation).
+  std::unordered_map<std::string, TorchVitalAttr> attrs;
+
+  // A vital must be given a name: default construction is deleted, copy and
+  // move construction are defaulted.
+  explicit TorchVital(std::string n) : name(std::move(n)) {}
+  TorchVital(const TorchVital&) = default;
+  TorchVital(TorchVital&&) = default;
+  TorchVital() = delete;
+
+  // Declared here, defined out of line.
+  // NOTE(review): presumably returns the attribute named `attr`, creating it
+  // if needed; the role of `force` is not visible in this header. Confirm.
+  TorchVitalAttr& create(const std::string& attr);
+  TorchVitalAttr& create(const std::string& attr, bool force);
+  // Stream-output operator, granted friend access.
+  friend std::ostream& operator<<(std::ostream& os, const TorchVital& dt);
+
+  // User-declared destructor, defined out of line.
+  ~TorchVital();
+};
+
+// Namespace-scope declaration of the stream-output operator for TorchVital
+// (defined out of line).
+std::ostream& operator<<(std::ostream& os, TorchVital const& tv);
+
+// A way to access vitals by string names instead of by global reference.
+// This enables access to vitals from the PythonAPI.
+// Non-copyable, non-movable registry of vitals addressed by string name.
+class TORCH_API APIVitals {
+ public:
+  // NOTE(review): not initialized in this header; presumably set by the
+  // out-of-line constructor. Confirm.
+  bool vitals_enabled;
+
+  // Set any vital sign that was added to the map.
+  // Parameters name the vital, the attribute within it, and the new value;
+  // `force` defaults to false. NOTE(review): the meaning of the bool return
+  // value is not visible in this header. Confirm against the implementation.
+  bool setVital(
+      const std::string& vital_name,
+      const std::string& attr_name,
+      const std::string& value,
+      bool force = false);
+  // Declared here, defined out of line; returns a string (presumably a
+  // rendering of the stored vitals; confirm).
+  std::string readVitals();
+
+  APIVitals();
+
+  // Ensure this stays a singleton
+  APIVitals(APIVitals const& other) = delete;
+  APIVitals(APIVitals&& other) = delete;
+  APIVitals& operator=(const APIVitals&) = delete;
+  APIVitals& operator=(APIVitals&&) = delete;
+
+ private:
+  // Vitals owned by this registry, looked up by string key.
+  std::unordered_map<std::string, TorchVital> name_map_;
+};
+
+// Declaration of the global APIVitals instance; the definition lives outside
+// this header.
+extern TORCH_API APIVitals VitalsAPI;
+
+} // namespace at::vitals
+
+// Declares (without defining) the global TorchVital object for `name`, so a
+// header can refer to a vital that TORCH_VITAL_DEFINE(name) defines in a
+// single translation unit. `extern` is required: without it the expansion is
+// a definition, which cannot compile because TorchVital's default
+// constructor is deleted.
+#define TORCH_VITAL_DECLARE(name) \
+  extern TORCH_API at::vitals::TorchVital TorchVital_##name;
+
+// Defines the global TorchVital object for `name`, constructed with the
+// stringified name.
+#define TORCH_VITAL_DEFINE(name) \
+  TORCH_API at::vitals::TorchVital TorchVital_##name(#name);
+
+// Expands to the identifier of the global TorchVital object for `name`.
+#define TORCH_VITAL_BASE(name) TorchVital_##name
+
+// Expands to a create("<attr>") call on the vital for `name`, yielding a
+// TorchVitalAttr& that can be streamed into.
+#define TORCH_VITAL(name, attr) TORCH_VITAL_BASE(name).create(#attr)
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/aten_interned_strings.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/aten_interned_strings.h
new file mode 100644
index 0000000000000000000000000000000000000000..00f9a90587b8266c4285972389a3636aa3443a6a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/aten_interned_strings.h
@@ -0,0 +1,2264 @@
+#pragma once
+
+// @generated by torchgen/gen.py from aten_interned_strings.h
+
+#if defined(TORCH_ASSERT_NO_OPERATORS) || defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on native_functions.yaml,          \
+  meaning the file will need to be re-compiled every time an operator   \
+  is changed or added. Consider if including <ATen/core/symbol.h> for   \
+  the c10::Symbol class would be sufficient, or if your change would be \
+  better placed in another file.
+#endif
+
+// ATen symbols correspond exactly to operators defined in ATen. Every
+// symbol here corresponds exactly to an ATen operation defined in
+// native_functions.yaml; attributes are in one-to-one correspondence
+// with their ATen name.
+
+#define FORALL_ATEN_BASE_SYMBOLS(_) \
+_(aten, __and__) \
+_(aten, __iand__) \
+_(aten, __ilshift__) \
+_(aten, __ior__) \
+_(aten, __irshift__) \
+_(aten, __ixor__) \
+_(aten, __lshift__) \
+_(aten, __or__) \
+_(aten, __rshift__) \
+_(aten, __xor__) \
+_(aten, _adaptive_avg_pool2d) \
+_(aten, _adaptive_avg_pool2d_backward) \
+_(aten, _adaptive_avg_pool3d) \
+_(aten, _adaptive_avg_pool3d_backward) \
+_(aten, _add_batch_dim) \
+_(aten, _add_relu) \
+_(aten, _add_relu_) \
+_(aten, _addmm_activation) \
+_(aten, _aminmax) \
+_(aten, _amp_foreach_non_finite_check_and_unscale) \
+_(aten, _amp_foreach_non_finite_check_and_unscale_) \
+_(aten, _amp_update_scale) \
+_(aten, _amp_update_scale_) \
+_(aten, _assert_async) \
+_(aten, _assert_scalar) \
+_(aten, _assert_tensor_metadata) \
+_(aten, _autocast_to_full_precision) \
+_(aten, _autocast_to_reduced_precision) \
+_(aten, _backward) \
+_(aten, _batch_norm_impl_index) \
+_(aten, _batch_norm_impl_index_backward) \
+_(aten, _batch_norm_no_update) \
+_(aten, _batch_norm_with_update) \
+_(aten, _batch_norm_with_update_functional) \
+_(aten, _cast_Byte) \
+_(aten, _cast_Char) \
+_(aten, _cast_Double) \
+_(aten, _cast_Float) \
+_(aten, _cast_Half) \
+_(aten, _cast_Int) \
+_(aten, _cast_Long) \
+_(aten, _cast_Short) \
+_(aten, _cdist_backward) \
+_(aten, _cdist_forward) \
+_(aten, _cholesky_solve_helper) \
+_(aten, _choose_qparams_per_tensor) \
+_(aten, _chunk_cat) \
+_(aten, _coalesce) \
+_(aten, _coalesced) \
+_(aten, _coalesced_) \
+_(aten, _compute_linear_combination) \
+_(aten, _conj) \
+_(aten, _conj_copy) \
+_(aten, _conj_physical) \
+_(aten, _conv_depthwise2d) \
+_(aten, _convert_indices_from_coo_to_csr) \
+_(aten, _convert_indices_from_csr_to_coo) \
+_(aten, _convert_weight_to_int4pack) \
+_(aten, _convolution) \
+_(aten, _convolution_double_backward) \
+_(aten, _convolution_mode) \
+_(aten, _copy_from) \
+_(aten, _copy_from_and_resize) \
+_(aten, _cslt_compress) \
+_(aten, _cslt_sparse_mm) \
+_(aten, _cslt_sparse_mm_search) \
+_(aten, _ctc_loss) \
+_(aten, _ctc_loss_backward) \
+_(aten, _cudnn_ctc_loss) \
+_(aten, _cudnn_init_dropout_state) \
+_(aten, _cudnn_rnn) \
+_(aten, _cudnn_rnn_backward) \
+_(aten, _cudnn_rnn_flatten_weight) \
+_(aten, _cufft_clear_plan_cache) \
+_(aten, _cufft_get_plan_cache_max_size) \
+_(aten, _cufft_get_plan_cache_size) \
+_(aten, _cufft_set_plan_cache_max_size) \
+_(aten, _cummax_helper) \
+_(aten, _cummin_helper) \
+_(aten, _debug_has_internal_overlap) \
+_(aten, _dimI) \
+_(aten, _dimV) \
+_(aten, _dim_arange) \
+_(aten, _dirichlet_grad) \
+_(aten, _efficient_attention_backward) \
+_(aten, _efficient_attention_forward) \
+_(aten, _efficientzerotensor) \
+_(aten, _embedding_bag) \
+_(aten, _embedding_bag_backward) \
+_(aten, _embedding_bag_dense_backward) \
+_(aten, _embedding_bag_forward_only) \
+_(aten, _embedding_bag_per_sample_weights_backward) \
+_(aten, _embedding_bag_sparse_backward) \
+_(aten, _empty_affine_quantized) \
+_(aten, _empty_per_channel_affine_quantized) \
+_(aten, _euclidean_dist) \
+_(aten, _fake_quantize_learnable_per_channel_affine) \
+_(aten, _fake_quantize_learnable_per_channel_affine_backward) \
+_(aten, _fake_quantize_learnable_per_tensor_affine) \
+_(aten, _fake_quantize_learnable_per_tensor_affine_backward) \
+_(aten, _fake_quantize_per_tensor_affine_cachemask_tensor_qparams) \
+_(aten, _fft_c2c) \
+_(aten, _fft_c2r) \
+_(aten, _fft_r2c) \
+_(aten, _fill_mem_eff_dropout_mask) \
+_(aten, _fill_mem_eff_dropout_mask_) \
+_(aten, _flash_attention_backward) \
+_(aten, _flash_attention_forward) \
+_(aten, _foobar) \
+_(aten, _foreach_abs) \
+_(aten, _foreach_abs_) \
+_(aten, _foreach_acos) \
+_(aten, _foreach_acos_) \
+_(aten, _foreach_add) \
+_(aten, _foreach_add_) \
+_(aten, _foreach_addcdiv) \
+_(aten, _foreach_addcdiv_) \
+_(aten, _foreach_addcmul) \
+_(aten, _foreach_addcmul_) \
+_(aten, _foreach_asin) \
+_(aten, _foreach_asin_) \
+_(aten, _foreach_atan) \
+_(aten, _foreach_atan_) \
+_(aten, _foreach_ceil) \
+_(aten, _foreach_ceil_) \
+_(aten, _foreach_clamp_max) \
+_(aten, _foreach_clamp_max_) \
+_(aten, _foreach_clamp_min) \
+_(aten, _foreach_clamp_min_) \
+_(aten, _foreach_copy) \
+_(aten, _foreach_copy_) \
+_(aten, _foreach_cos) \
+_(aten, _foreach_cos_) \
+_(aten, _foreach_cosh) \
+_(aten, _foreach_cosh_) \
+_(aten, _foreach_div) \
+_(aten, _foreach_div_) \
+_(aten, _foreach_erf) \
+_(aten, _foreach_erf_) \
+_(aten, _foreach_erfc) \
+_(aten, _foreach_erfc_) \
+_(aten, _foreach_exp) \
+_(aten, _foreach_exp_) \
+_(aten, _foreach_expm1) \
+_(aten, _foreach_expm1_) \
+_(aten, _foreach_floor) \
+_(aten, _foreach_floor_) \
+_(aten, _foreach_frac) \
+_(aten, _foreach_frac_) \
+_(aten, _foreach_lerp) \
+_(aten, _foreach_lerp_) \
+_(aten, _foreach_lgamma) \
+_(aten, _foreach_lgamma_) \
+_(aten, _foreach_log) \
+_(aten, _foreach_log10) \
+_(aten, _foreach_log10_) \
+_(aten, _foreach_log1p) \
+_(aten, _foreach_log1p_) \
+_(aten, _foreach_log2) \
+_(aten, _foreach_log2_) \
+_(aten, _foreach_log_) \
+_(aten, _foreach_max) \
+_(aten, _foreach_maximum) \
+_(aten, _foreach_maximum_) \
+_(aten, _foreach_minimum) \
+_(aten, _foreach_minimum_) \
+_(aten, _foreach_mul) \
+_(aten, _foreach_mul_) \
+_(aten, _foreach_neg) \
+_(aten, _foreach_neg_) \
+_(aten, _foreach_norm) \
+_(aten, _foreach_pow) \
+_(aten, _foreach_pow_) \
+_(aten, _foreach_reciprocal) \
+_(aten, _foreach_reciprocal_) \
+_(aten, _foreach_round) \
+_(aten, _foreach_round_) \
+_(aten, _foreach_sigmoid) \
+_(aten, _foreach_sigmoid_) \
+_(aten, _foreach_sign) \
+_(aten, _foreach_sign_) \
+_(aten, _foreach_sin) \
+_(aten, _foreach_sin_) \
+_(aten, _foreach_sinh) \
+_(aten, _foreach_sinh_) \
+_(aten, _foreach_sqrt) \
+_(aten, _foreach_sqrt_) \
+_(aten, _foreach_sub) \
+_(aten, _foreach_sub_) \
+_(aten, _foreach_tan) \
+_(aten, _foreach_tan_) \
+_(aten, _foreach_tanh) \
+_(aten, _foreach_tanh_) \
+_(aten, _foreach_trunc) \
+_(aten, _foreach_trunc_) \
+_(aten, _foreach_zero) \
+_(aten, _foreach_zero_) \
+_(aten, _functional_assert_async) \
+_(aten, _functional_assert_scalar) \
+_(aten, _functional_sym_constrain_range) \
+_(aten, _functional_sym_constrain_range_for_size) \
+_(aten, _fused_adagrad) \
+_(aten, _fused_adagrad_) \
+_(aten, _fused_adam) \
+_(aten, _fused_adam_) \
+_(aten, _fused_adamw) \
+_(aten, _fused_adamw_) \
+_(aten, _fused_dropout) \
+_(aten, _fused_moving_avg_obs_fq_helper) \
+_(aten, _fused_moving_avg_obs_fq_helper_functional) \
+_(aten, _fused_sdp_choice) \
+_(aten, _fused_sgd) \
+_(aten, _fused_sgd_) \
+_(aten, _fw_primal) \
+_(aten, _fw_primal_copy) \
+_(aten, _gather_sparse_backward) \
+_(aten, _grid_sampler_2d_cpu_fallback) \
+_(aten, _grid_sampler_2d_cpu_fallback_backward) \
+_(aten, _has_compatible_shallow_copy_type) \
+_(aten, _has_same_storage_numel) \
+_(aten, _histogramdd_bin_edges) \
+_(aten, _histogramdd_from_bin_cts) \
+_(aten, _histogramdd_from_bin_tensors) \
+_(aten, _index_put_impl) \
+_(aten, _index_put_impl_) \
+_(aten, _indices) \
+_(aten, _indices_copy) \
+_(aten, _int_mm) \
+_(aten, _is_all_true) \
+_(aten, _is_any_true) \
+_(aten, _is_zerotensor) \
+_(aten, _jagged_to_padded_dense_forward) \
+_(aten, _lazy_clone) \
+_(aten, _linalg_check_errors) \
+_(aten, _linalg_det) \
+_(aten, _linalg_eigh) \
+_(aten, _linalg_eigvals) \
+_(aten, _linalg_slogdet) \
+_(aten, _linalg_solve_ex) \
+_(aten, _linalg_svd) \
+_(aten, _local_scalar_dense) \
+_(aten, _log_softmax) \
+_(aten, _log_softmax_backward_data) \
+_(aten, _logcumsumexp) \
+_(aten, _lstm_mps) \
+_(aten, _lu_with_info) \
+_(aten, _make_dep_token) \
+_(aten, _make_dual) \
+_(aten, _make_dual_copy) \
+_(aten, _make_per_channel_quantized_tensor) \
+_(aten, _make_per_tensor_quantized_tensor) \
+_(aten, _masked_scale) \
+_(aten, _masked_softmax) \
+_(aten, _masked_softmax_backward) \
+_(aten, _mixed_dtypes_linear) \
+_(aten, _mkldnn_reshape) \
+_(aten, _mkldnn_transpose) \
+_(aten, _mkldnn_transpose_) \
+_(aten, _mps_convolution) \
+_(aten, _mps_convolution_transpose) \
+_(aten, _native_batch_norm_legit) \
+_(aten, _native_batch_norm_legit_functional) \
+_(aten, _native_batch_norm_legit_no_training) \
+_(aten, _native_multi_head_attention) \
+_(aten, _neg_view) \
+_(aten, _neg_view_copy) \
+_(aten, _nested_compute_contiguous_strides_offsets) \
+_(aten, _nested_from_padded) \
+_(aten, _nested_from_padded_and_nested_example) \
+_(aten, _nested_get_jagged_dummy) \
+_(aten, _nested_get_lengths) \
+_(aten, _nested_get_max_seqlen) \
+_(aten, _nested_get_min_seqlen) \
+_(aten, _nested_get_offsets) \
+_(aten, _nested_get_ragged_idx) \
+_(aten, _nested_get_values) \
+_(aten, _nested_get_values_copy) \
+_(aten, _nested_select_backward) \
+_(aten, _nested_sum_backward) \
+_(aten, _nested_tensor_from_mask) \
+_(aten, _nested_tensor_from_mask_left_aligned) \
+_(aten, _nested_tensor_from_tensor_list) \
+_(aten, _nested_tensor_size) \
+_(aten, _nested_tensor_softmax_with_shape) \
+_(aten, _nested_tensor_storage_offsets) \
+_(aten, _nested_tensor_strides) \
+_(aten, _nested_view_from_buffer) \
+_(aten, _nested_view_from_buffer_copy) \
+_(aten, _nested_view_from_jagged) \
+_(aten, _nested_view_from_jagged_copy) \
+_(aten, _new_zeros_with_same_feature_meta) \
+_(aten, _nnpack_available) \
+_(aten, _nnpack_spatial_convolution) \
+_(aten, _nnz) \
+_(aten, _pack_padded_sequence) \
+_(aten, _pack_padded_sequence_backward) \
+_(aten, _pad_circular) \
+_(aten, _pad_enum) \
+_(aten, _pad_packed_sequence) \
+_(aten, _padded_dense_to_jagged_forward) \
+_(aten, _pdist_backward) \
+_(aten, _pdist_forward) \
+_(aten, _pin_memory) \
+_(aten, _prelu_kernel) \
+_(aten, _prelu_kernel_backward) \
+_(aten, _print) \
+_(aten, _propagate_xla_data) \
+_(aten, _remove_batch_dim) \
+_(aten, _reshape_alias) \
+_(aten, _reshape_alias_copy) \
+_(aten, _reshape_copy) \
+_(aten, _reshape_from_tensor) \
+_(aten, _resize_output) \
+_(aten, _resize_output_) \
+_(aten, _rowwise_prune) \
+_(aten, _safe_softmax) \
+_(aten, _sample_dirichlet) \
+_(aten, _saturate_weight_to_fp16) \
+_(aten, _scaled_dot_product_attention_math) \
+_(aten, _scaled_dot_product_attention_math_for_mps) \
+_(aten, _scaled_dot_product_cudnn_attention) \
+_(aten, _scaled_dot_product_cudnn_attention_backward) \
+_(aten, _scaled_dot_product_efficient_attention) \
+_(aten, _scaled_dot_product_efficient_attention_backward) \
+_(aten, _scaled_dot_product_flash_attention) \
+_(aten, _scaled_dot_product_flash_attention_backward) \
+_(aten, _scaled_dot_product_flash_attention_for_cpu) \
+_(aten, _scaled_dot_product_flash_attention_for_cpu_backward) \
+_(aten, _scaled_dot_product_fused_attention_overrideable) \
+_(aten, _scaled_dot_product_fused_attention_overrideable_backward) \
+_(aten, _scaled_mm) \
+_(aten, _segment_reduce_backward) \
+_(aten, _shape_as_tensor) \
+_(aten, _slow_conv2d_backward) \
+_(aten, _slow_conv2d_forward) \
+_(aten, _sobol_engine_draw) \
+_(aten, _sobol_engine_ff) \
+_(aten, _sobol_engine_ff_) \
+_(aten, _sobol_engine_initialize_state) \
+_(aten, _sobol_engine_initialize_state_) \
+_(aten, _sobol_engine_scramble) \
+_(aten, _sobol_engine_scramble_) \
+_(aten, _softmax) \
+_(aten, _softmax_backward_data) \
+_(aten, _sparse_addmm) \
+_(aten, _sparse_broadcast_to) \
+_(aten, _sparse_broadcast_to_copy) \
+_(aten, _sparse_bsc_tensor_unsafe) \
+_(aten, _sparse_bsr_tensor_unsafe) \
+_(aten, _sparse_compressed_tensor_unsafe) \
+_(aten, _sparse_compressed_tensor_with_dims) \
+_(aten, _sparse_coo_tensor_unsafe) \
+_(aten, _sparse_coo_tensor_with_dims) \
+_(aten, _sparse_coo_tensor_with_dims_and_tensors) \
+_(aten, _sparse_csc_tensor_unsafe) \
+_(aten, _sparse_csr_prod) \
+_(aten, _sparse_csr_sum) \
+_(aten, _sparse_csr_tensor_unsafe) \
+_(aten, _sparse_log_softmax) \
+_(aten, _sparse_log_softmax_backward_data) \
+_(aten, _sparse_mask_projection) \
+_(aten, _sparse_mm) \
+_(aten, _sparse_mm_reduce_impl) \
+_(aten, _sparse_mm_reduce_impl_backward) \
+_(aten, _sparse_semi_structured_addmm) \
+_(aten, _sparse_semi_structured_apply) \
+_(aten, _sparse_semi_structured_apply_dense) \
+_(aten, _sparse_semi_structured_linear) \
+_(aten, _sparse_semi_structured_mm) \
+_(aten, _sparse_semi_structured_tile) \
+_(aten, _sparse_softmax) \
+_(aten, _sparse_softmax_backward_data) \
+_(aten, _sparse_sparse_matmul) \
+_(aten, _sparse_sum) \
+_(aten, _sparse_sum_backward) \
+_(aten, _spdiags) \
+_(aten, _spsolve) \
+_(aten, _stack) \
+_(aten, _standard_gamma) \
+_(aten, _standard_gamma_grad) \
+_(aten, _test_ambiguous_defaults) \
+_(aten, _test_autograd_multiple_dispatch) \
+_(aten, _test_autograd_multiple_dispatch_view) \
+_(aten, _test_autograd_multiple_dispatch_view_copy) \
+_(aten, _test_check_tensor) \
+_(aten, _test_functorch_fallback) \
+_(aten, _test_optional_filled_intlist) \
+_(aten, _test_optional_floatlist) \
+_(aten, _test_optional_intlist) \
+_(aten, _test_parallel_materialize) \
+_(aten, _test_serialization_subcmul) \
+_(aten, _test_string_default) \
+_(aten, _test_warn_in_autograd) \
+_(aten, _thnn_differentiable_gru_cell_backward) \
+_(aten, _thnn_differentiable_lstm_cell_backward) \
+_(aten, _thnn_fused_gru_cell) \
+_(aten, _thnn_fused_gru_cell_backward) \
+_(aten, _thnn_fused_lstm_cell) \
+_(aten, _thnn_fused_lstm_cell_backward) \
+_(aten, _thnn_fused_lstm_cell_backward_impl) \
+_(aten, _to_copy) \
+_(aten, _to_cpu) \
+_(aten, _to_dense) \
+_(aten, _to_sparse) \
+_(aten, _to_sparse_bsc) \
+_(aten, _to_sparse_bsr) \
+_(aten, _to_sparse_csc) \
+_(aten, _to_sparse_csr) \
+_(aten, _to_sparse_semi_structured) \
+_(aten, _transform_bias_rescale_qkv) \
+_(aten, _transformer_encoder_layer_fwd) \
+_(aten, _trilinear) \
+_(aten, _triton_multi_head_attention) \
+_(aten, _triton_scaled_dot_attention) \
+_(aten, _unique) \
+_(aten, _unique2) \
+_(aten, _unpack_dual) \
+_(aten, _unsafe_index) \
+_(aten, _unsafe_index_put) \
+_(aten, _unsafe_masked_index) \
+_(aten, _unsafe_masked_index_put_accumulate) \
+_(aten, _unsafe_view) \
+_(aten, _upsample_bicubic2d_aa) \
+_(aten, _upsample_bicubic2d_aa_backward) \
+_(aten, _upsample_bilinear2d_aa) \
+_(aten, _upsample_bilinear2d_aa_backward) \
+_(aten, _upsample_nearest_exact1d) \
+_(aten, _upsample_nearest_exact1d_backward) \
+_(aten, _upsample_nearest_exact2d) \
+_(aten, _upsample_nearest_exact2d_backward) \
+_(aten, _upsample_nearest_exact3d) \
+_(aten, _upsample_nearest_exact3d_backward) \
+_(aten, _use_cudnn_ctc_loss) \
+_(aten, _use_cudnn_rnn_flatten_weight) \
+_(aten, _validate_compressed_sparse_indices) \
+_(aten, _validate_sparse_bsc_tensor_args) \
+_(aten, _validate_sparse_bsr_tensor_args) \
+_(aten, _validate_sparse_compressed_tensor_args) \
+_(aten, _validate_sparse_coo_tensor_args) \
+_(aten, _validate_sparse_csc_tensor_args) \
+_(aten, _validate_sparse_csr_tensor_args) \
+_(aten, _values) \
+_(aten, _values_copy) \
+_(aten, _version) \
+_(aten, _weight_int4pack_mm) \
+_(aten, _weight_int8pack_mm) \
+_(aten, _weight_norm) \
+_(aten, _weight_norm_differentiable_backward) \
+_(aten, _weight_norm_interface) \
+_(aten, _weight_norm_interface_backward) \
+_(aten, _wrapped_linear_prepack) \
+_(aten, _wrapped_quantized_linear_prepacked) \
+_(aten, abs) \
+_(aten, abs_) \
+_(aten, absolute) \
+_(aten, absolute_) \
+_(aten, acos) \
+_(aten, acos_) \
+_(aten, acosh) \
+_(aten, acosh_) \
+_(aten, adaptive_avg_pool1d) \
+_(aten, adaptive_avg_pool2d) \
+_(aten, adaptive_avg_pool3d) \
+_(aten, adaptive_avg_pool3d_backward) \
+_(aten, adaptive_max_pool1d) \
+_(aten, adaptive_max_pool2d) \
+_(aten, adaptive_max_pool2d_backward) \
+_(aten, adaptive_max_pool3d) \
+_(aten, adaptive_max_pool3d_backward) \
+_(aten, add) \
+_(aten, add_) \
+_(aten, addbmm) \
+_(aten, addbmm_) \
+_(aten, addcdiv) \
+_(aten, addcdiv_) \
+_(aten, addcmul) \
+_(aten, addcmul_) \
+_(aten, addmm) \
+_(aten, addmm_) \
+_(aten, addmv) \
+_(aten, addmv_) \
+_(aten, addr) \
+_(aten, addr_) \
+_(aten, adjoint) \
+_(aten, affine_grid_generator) \
+_(aten, affine_grid_generator_backward) \
+_(aten, alias) \
+_(aten, alias_copy) \
+_(aten, align_as) \
+_(aten, align_tensors) \
+_(aten, align_to) \
+_(aten, all) \
+_(aten, allclose) \
+_(aten, alpha_dropout) \
+_(aten, alpha_dropout_) \
+_(aten, amax) \
+_(aten, amin) \
+_(aten, aminmax) \
+_(aten, angle) \
+_(aten, any) \
+_(aten, arange) \
+_(aten, arccos) \
+_(aten, arccos_) \
+_(aten, arccosh) \
+_(aten, arccosh_) \
+_(aten, arcsin) \
+_(aten, arcsin_) \
+_(aten, arcsinh) \
+_(aten, arcsinh_) \
+_(aten, arctan) \
+_(aten, arctan2) \
+_(aten, arctan2_) \
+_(aten, arctan_) \
+_(aten, arctanh) \
+_(aten, arctanh_) \
+_(aten, argmax) \
+_(aten, argmin) \
+_(aten, argsort) \
+_(aten, argwhere) \
+_(aten, as_strided) \
+_(aten, as_strided_) \
+_(aten, as_strided_copy) \
+_(aten, as_strided_scatter) \
+_(aten, asin) \
+_(aten, asin_) \
+_(aten, asinh) \
+_(aten, asinh_) \
+_(aten, atan) \
+_(aten, atan2) \
+_(aten, atan2_) \
+_(aten, atan_) \
+_(aten, atanh) \
+_(aten, atanh_) \
+_(aten, atleast_1d) \
+_(aten, atleast_2d) \
+_(aten, atleast_3d) \
+_(aten, avg_pool1d) \
+_(aten, avg_pool2d) \
+_(aten, avg_pool2d_backward) \
+_(aten, avg_pool3d) \
+_(aten, avg_pool3d_backward) \
+_(aten, baddbmm) \
+_(aten, baddbmm_) \
+_(aten, bartlett_window) \
+_(aten, batch_norm) \
+_(aten, batch_norm_backward) \
+_(aten, batch_norm_backward_elemt) \
+_(aten, batch_norm_backward_reduce) \
+_(aten, batch_norm_elemt) \
+_(aten, batch_norm_gather_stats) \
+_(aten, batch_norm_gather_stats_with_counts) \
+_(aten, batch_norm_stats) \
+_(aten, batch_norm_update_stats) \
+_(aten, bernoulli) \
+_(aten, bernoulli_) \
+_(aten, bilinear) \
+_(aten, binary_cross_entropy) \
+_(aten, binary_cross_entropy_backward) \
+_(aten, binary_cross_entropy_with_logits) \
+_(aten, bincount) \
+_(aten, binomial) \
+_(aten, bitwise_and) \
+_(aten, bitwise_and_) \
+_(aten, bitwise_left_shift) \
+_(aten, bitwise_left_shift_) \
+_(aten, bitwise_not) \
+_(aten, bitwise_not_) \
+_(aten, bitwise_or) \
+_(aten, bitwise_or_) \
+_(aten, bitwise_right_shift) \
+_(aten, bitwise_right_shift_) \
+_(aten, bitwise_xor) \
+_(aten, bitwise_xor_) \
+_(aten, blackman_window) \
+_(aten, block_diag) \
+_(aten, bmm) \
+_(aten, broadcast_tensors) \
+_(aten, broadcast_to) \
+_(aten, bucketize) \
+_(aten, can_cast) \
+_(aten, cartesian_prod) \
+_(aten, cat) \
+_(aten, cauchy) \
+_(aten, cauchy_) \
+_(aten, ccol_indices) \
+_(aten, ccol_indices_copy) \
+_(aten, cdist) \
+_(aten, ceil) \
+_(aten, ceil_) \
+_(aten, celu) \
+_(aten, celu_) \
+_(aten, chain_matmul) \
+_(aten, chalf) \
+_(aten, channel_shuffle) \
+_(aten, cholesky) \
+_(aten, cholesky_inverse) \
+_(aten, cholesky_solve) \
+_(aten, choose_qparams_optimized) \
+_(aten, chunk) \
+_(aten, clamp) \
+_(aten, clamp_) \
+_(aten, clamp_max) \
+_(aten, clamp_max_) \
+_(aten, clamp_min) \
+_(aten, clamp_min_) \
+_(aten, clip) \
+_(aten, clip_) \
+_(aten, clone) \
+_(aten, coalesce) \
+_(aten, col2im) \
+_(aten, col_indices) \
+_(aten, col_indices_copy) \
+_(aten, column_stack) \
+_(aten, combinations) \
+_(aten, complex) \
+_(aten, concat) \
+_(aten, concatenate) \
+_(aten, conj) \
+_(aten, conj_physical) \
+_(aten, conj_physical_) \
+_(aten, constant_pad_nd) \
+_(aten, contiguous) \
+_(aten, conv1d) \
+_(aten, conv2d) \
+_(aten, conv3d) \
+_(aten, conv_depthwise3d) \
+_(aten, conv_tbc) \
+_(aten, conv_tbc_backward) \
+_(aten, conv_transpose1d) \
+_(aten, conv_transpose2d) \
+_(aten, conv_transpose3d) \
+_(aten, convolution) \
+_(aten, convolution_backward) \
+_(aten, convolution_backward_overrideable) \
+_(aten, convolution_overrideable) \
+_(aten, copy) \
+_(aten, copy_) \
+_(aten, copy_sparse_to_sparse) \
+_(aten, copy_sparse_to_sparse_) \
+_(aten, copysign) \
+_(aten, copysign_) \
+_(aten, corrcoef) \
+_(aten, cos) \
+_(aten, cos_) \
+_(aten, cosh) \
+_(aten, cosh_) \
+_(aten, cosine_embedding_loss) \
+_(aten, cosine_similarity) \
+_(aten, count_nonzero) \
+_(aten, cov) \
+_(aten, cross) \
+_(aten, cross_entropy_loss) \
+_(aten, crow_indices) \
+_(aten, crow_indices_copy) \
+_(aten, ctc_loss) \
+_(aten, cudnn_affine_grid_generator) \
+_(aten, cudnn_affine_grid_generator_backward) \
+_(aten, cudnn_batch_norm) \
+_(aten, cudnn_batch_norm_backward) \
+_(aten, cudnn_convolution) \
+_(aten, cudnn_convolution_add_relu) \
+_(aten, cudnn_convolution_relu) \
+_(aten, cudnn_convolution_transpose) \
+_(aten, cudnn_grid_sampler) \
+_(aten, cudnn_grid_sampler_backward) \
+_(aten, cudnn_is_acceptable) \
+_(aten, cummax) \
+_(aten, cummaxmin_backward) \
+_(aten, cummin) \
+_(aten, cumprod) \
+_(aten, cumprod_) \
+_(aten, cumprod_backward) \
+_(aten, cumsum) \
+_(aten, cumsum_) \
+_(aten, cumulative_trapezoid) \
+_(aten, data) \
+_(aten, deg2rad) \
+_(aten, deg2rad_) \
+_(aten, dense_dim) \
+_(aten, dequantize) \
+_(aten, det) \
+_(aten, detach) \
+_(aten, detach_) \
+_(aten, detach_copy) \
+_(aten, diag) \
+_(aten, diag_embed) \
+_(aten, diagflat) \
+_(aten, diagonal) \
+_(aten, diagonal_backward) \
+_(aten, diagonal_copy) \
+_(aten, diagonal_scatter) \
+_(aten, diff) \
+_(aten, digamma) \
+_(aten, digamma_) \
+_(aten, dist) \
+_(aten, div) \
+_(aten, div_) \
+_(aten, divide) \
+_(aten, divide_) \
+_(aten, dot) \
+_(aten, dropout) \
+_(aten, dropout_) \
+_(aten, dsplit) \
+_(aten, dstack) \
+_(aten, einsum) \
+_(aten, elu) \
+_(aten, elu_) \
+_(aten, elu_backward) \
+_(aten, embedding) \
+_(aten, embedding_backward) \
+_(aten, embedding_bag) \
+_(aten, embedding_dense_backward) \
+_(aten, embedding_renorm) \
+_(aten, embedding_renorm_) \
+_(aten, embedding_sparse_backward) \
+_(aten, empty) \
+_(aten, empty_like) \
+_(aten, empty_permuted) \
+_(aten, empty_quantized) \
+_(aten, empty_strided) \
+_(aten, eq) \
+_(aten, eq_) \
+_(aten, equal) \
+_(aten, erf) \
+_(aten, erf_) \
+_(aten, erfc) \
+_(aten, erfc_) \
+_(aten, erfinv) \
+_(aten, erfinv_) \
+_(aten, exp) \
+_(aten, exp2) \
+_(aten, exp2_) \
+_(aten, exp_) \
+_(aten, expand) \
+_(aten, expand_as) \
+_(aten, expand_copy) \
+_(aten, expm1) \
+_(aten, expm1_) \
+_(aten, exponential) \
+_(aten, exponential_) \
+_(aten, eye) \
+_(aten, fake_quantize_per_channel_affine) \
+_(aten, fake_quantize_per_channel_affine_cachemask) \
+_(aten, fake_quantize_per_channel_affine_cachemask_backward) \
+_(aten, fake_quantize_per_tensor_affine) \
+_(aten, fake_quantize_per_tensor_affine_cachemask) \
+_(aten, fake_quantize_per_tensor_affine_cachemask_backward) \
+_(aten, fbgemm_linear_fp16_weight) \
+_(aten, fbgemm_linear_fp16_weight_fp32_activation) \
+_(aten, fbgemm_linear_int8_weight) \
+_(aten, fbgemm_linear_int8_weight_fp32_activation) \
+_(aten, fbgemm_linear_quantize_weight) \
+_(aten, fbgemm_pack_gemm_matrix_fp16) \
+_(aten, fbgemm_pack_quantized_matrix) \
+_(aten, feature_alpha_dropout) \
+_(aten, feature_alpha_dropout_) \
+_(aten, feature_dropout) \
+_(aten, feature_dropout_) \
+_(aten, fft_fft) \
+_(aten, fft_fft2) \
+_(aten, fft_fftfreq) \
+_(aten, fft_fftn) \
+_(aten, fft_fftshift) \
+_(aten, fft_hfft) \
+_(aten, fft_hfft2) \
+_(aten, fft_hfftn) \
+_(aten, fft_ifft) \
+_(aten, fft_ifft2) \
+_(aten, fft_ifftn) \
+_(aten, fft_ifftshift) \
+_(aten, fft_ihfft) \
+_(aten, fft_ihfft2) \
+_(aten, fft_ihfftn) \
+_(aten, fft_irfft) \
+_(aten, fft_irfft2) \
+_(aten, fft_irfftn) \
+_(aten, fft_rfft) \
+_(aten, fft_rfft2) \
+_(aten, fft_rfftfreq) \
+_(aten, fft_rfftn) \
+_(aten, fill) \
+_(aten, fill_) \
+_(aten, fill_diagonal) \
+_(aten, fill_diagonal_) \
+_(aten, fix) \
+_(aten, fix_) \
+_(aten, flatten) \
+_(aten, flatten_dense_tensors) \
+_(aten, flip) \
+_(aten, fliplr) \
+_(aten, flipud) \
+_(aten, float_power) \
+_(aten, float_power_) \
+_(aten, floor) \
+_(aten, floor_) \
+_(aten, floor_divide) \
+_(aten, floor_divide_) \
+_(aten, fmax) \
+_(aten, fmin) \
+_(aten, fmod) \
+_(aten, fmod_) \
+_(aten, frac) \
+_(aten, frac_) \
+_(aten, fractional_max_pool2d) \
+_(aten, fractional_max_pool2d_backward) \
+_(aten, fractional_max_pool3d) \
+_(aten, fractional_max_pool3d_backward) \
+_(aten, frexp) \
+_(aten, frobenius_norm) \
+_(aten, from_file) \
+_(aten, full) \
+_(aten, full_like) \
+_(aten, fused_moving_avg_obs_fake_quant) \
+_(aten, gather) \
+_(aten, gather_backward) \
+_(aten, gcd) \
+_(aten, gcd_) \
+_(aten, ge) \
+_(aten, ge_) \
+_(aten, gelu) \
+_(aten, gelu_) \
+_(aten, gelu_backward) \
+_(aten, geometric) \
+_(aten, geometric_) \
+_(aten, geqrf) \
+_(aten, ger) \
+_(aten, glu) \
+_(aten, glu_backward) \
+_(aten, glu_backward_jvp) \
+_(aten, glu_jvp) \
+_(aten, gradient) \
+_(aten, greater) \
+_(aten, greater_) \
+_(aten, greater_equal) \
+_(aten, greater_equal_) \
+_(aten, grid_sampler) \
+_(aten, grid_sampler_2d) \
+_(aten, grid_sampler_2d_backward) \
+_(aten, grid_sampler_3d) \
+_(aten, grid_sampler_3d_backward) \
+_(aten, group_norm) \
+_(aten, gru) \
+_(aten, gru_cell) \
+_(aten, gt) \
+_(aten, gt_) \
+_(aten, hamming_window) \
+_(aten, hann_window) \
+_(aten, hardshrink) \
+_(aten, hardshrink_backward) \
+_(aten, hardsigmoid) \
+_(aten, hardsigmoid_) \
+_(aten, hardsigmoid_backward) \
+_(aten, hardswish) \
+_(aten, hardswish_) \
+_(aten, hardswish_backward) \
+_(aten, hardtanh) \
+_(aten, hardtanh_) \
+_(aten, hardtanh_backward) \
+_(aten, heaviside) \
+_(aten, heaviside_) \
+_(aten, hinge_embedding_loss) \
+_(aten, histc) \
+_(aten, histogram) \
+_(aten, histogramdd) \
+_(aten, hsplit) \
+_(aten, hspmm) \
+_(aten, hstack) \
+_(aten, huber_loss) \
+_(aten, huber_loss_backward) \
+_(aten, hypot) \
+_(aten, hypot_) \
+_(aten, i0) \
+_(aten, i0_) \
+_(aten, igamma) \
+_(aten, igamma_) \
+_(aten, igammac) \
+_(aten, igammac_) \
+_(aten, im2col) \
+_(aten, imag) \
+_(aten, index) \
+_(aten, index_add) \
+_(aten, index_add_) \
+_(aten, index_copy) \
+_(aten, index_copy_) \
+_(aten, index_fill) \
+_(aten, index_fill_) \
+_(aten, index_put) \
+_(aten, index_put_) \
+_(aten, index_reduce) \
+_(aten, index_reduce_) \
+_(aten, index_select) \
+_(aten, index_select_backward) \
+_(aten, indices) \
+_(aten, indices_copy) \
+_(aten, infinitely_differentiable_gelu_backward) \
+_(aten, inner) \
+_(aten, instance_norm) \
+_(aten, int_repr) \
+_(aten, inverse) \
+_(aten, is_coalesced) \
+_(aten, is_complex) \
+_(aten, is_conj) \
+_(aten, is_distributed) \
+_(aten, is_floating_point) \
+_(aten, is_inference) \
+_(aten, is_leaf) \
+_(aten, is_neg) \
+_(aten, is_nonzero) \
+_(aten, is_pinned) \
+_(aten, is_same_size) \
+_(aten, is_set_to) \
+_(aten, is_signed) \
+_(aten, is_vulkan_available) \
+_(aten, isclose) \
+_(aten, isfinite) \
+_(aten, isin) \
+_(aten, isinf) \
+_(aten, isnan) \
+_(aten, isneginf) \
+_(aten, isposinf) \
+_(aten, isreal) \
+_(aten, istft) \
+_(aten, item) \
+_(aten, kaiser_window) \
+_(aten, kl_div) \
+_(aten, kron) \
+_(aten, kthvalue) \
+_(aten, l1_loss) \
+_(aten, layer_norm) \
+_(aten, lcm) \
+_(aten, lcm_) \
+_(aten, ldexp) \
+_(aten, ldexp_) \
+_(aten, le) \
+_(aten, le_) \
+_(aten, leaky_relu) \
+_(aten, leaky_relu_) \
+_(aten, leaky_relu_backward) \
+_(aten, lerp) \
+_(aten, lerp_) \
+_(aten, less) \
+_(aten, less_) \
+_(aten, less_equal) \
+_(aten, less_equal_) \
+_(aten, lgamma) \
+_(aten, lgamma_) \
+_(aten, lift) \
+_(aten, lift_fresh) \
+_(aten, lift_fresh_copy) \
+_(aten, linalg_cholesky) \
+_(aten, linalg_cholesky_ex) \
+_(aten, linalg_cond) \
+_(aten, linalg_cross) \
+_(aten, linalg_det) \
+_(aten, linalg_diagonal) \
+_(aten, linalg_eig) \
+_(aten, linalg_eigh) \
+_(aten, linalg_eigvals) \
+_(aten, linalg_eigvalsh) \
+_(aten, linalg_householder_product) \
+_(aten, linalg_inv) \
+_(aten, linalg_inv_ex) \
+_(aten, linalg_ldl_factor) \
+_(aten, linalg_ldl_factor_ex) \
+_(aten, linalg_ldl_solve) \
+_(aten, linalg_lstsq) \
+_(aten, linalg_lu) \
+_(aten, linalg_lu_factor) \
+_(aten, linalg_lu_factor_ex) \
+_(aten, linalg_lu_solve) \
+_(aten, linalg_matmul) \
+_(aten, linalg_matrix_exp) \
+_(aten, linalg_matrix_norm) \
+_(aten, linalg_matrix_power) \
+_(aten, linalg_matrix_rank) \
+_(aten, linalg_multi_dot) \
+_(aten, linalg_norm) \
+_(aten, linalg_pinv) \
+_(aten, linalg_qr) \
+_(aten, linalg_slogdet) \
+_(aten, linalg_solve) \
+_(aten, linalg_solve_ex) \
+_(aten, linalg_solve_triangular) \
+_(aten, linalg_svd) \
+_(aten, linalg_svdvals) \
+_(aten, linalg_tensorinv) \
+_(aten, linalg_tensorsolve) \
+_(aten, linalg_vander) \
+_(aten, linalg_vecdot) \
+_(aten, linalg_vector_norm) \
+_(aten, linear) \
+_(aten, linear_backward) \
+_(aten, linspace) \
+_(aten, log) \
+_(aten, log10) \
+_(aten, log10_) \
+_(aten, log1p) \
+_(aten, log1p_) \
+_(aten, log2) \
+_(aten, log2_) \
+_(aten, log_) \
+_(aten, log_normal) \
+_(aten, log_normal_) \
+_(aten, log_sigmoid) \
+_(aten, log_sigmoid_backward) \
+_(aten, log_sigmoid_forward) \
+_(aten, log_softmax) \
+_(aten, logaddexp) \
+_(aten, logaddexp2) \
+_(aten, logcumsumexp) \
+_(aten, logdet) \
+_(aten, logical_and) \
+_(aten, logical_and_) \
+_(aten, logical_not) \
+_(aten, logical_not_) \
+_(aten, logical_or) \
+_(aten, logical_or_) \
+_(aten, logical_xor) \
+_(aten, logical_xor_) \
+_(aten, logit) \
+_(aten, logit_) \
+_(aten, logit_backward) \
+_(aten, logspace) \
+_(aten, logsumexp) \
+_(aten, lshift) \
+_(aten, lstm) \
+_(aten, lstm_cell) \
+_(aten, lstm_mps_backward) \
+_(aten, lt) \
+_(aten, lt_) \
+_(aten, lu_solve) \
+_(aten, lu_unpack) \
+_(aten, mH) \
+_(aten, mT) \
+_(aten, margin_ranking_loss) \
+_(aten, masked_fill) \
+_(aten, masked_fill_) \
+_(aten, masked_scatter) \
+_(aten, masked_scatter_) \
+_(aten, masked_scatter_backward) \
+_(aten, masked_select) \
+_(aten, masked_select_backward) \
+_(aten, matmul) \
+_(aten, matmul_backward) \
+_(aten, matrix_H) \
+_(aten, matrix_exp) \
+_(aten, matrix_exp_backward) \
+_(aten, matrix_power) \
+_(aten, max) \
+_(aten, max_pool1d) \
+_(aten, max_pool1d_with_indices) \
+_(aten, max_pool2d) \
+_(aten, max_pool2d_backward) \
+_(aten, max_pool2d_with_indices) \
+_(aten, max_pool2d_with_indices_backward) \
+_(aten, max_pool3d) \
+_(aten, max_pool3d_with_indices) \
+_(aten, max_pool3d_with_indices_backward) \
+_(aten, max_unpool2d) \
+_(aten, max_unpool3d) \
+_(aten, maximum) \
+_(aten, mean) \
+_(aten, median) \
+_(aten, meshgrid) \
+_(aten, min) \
+_(aten, minimum) \
+_(aten, miopen_batch_norm) \
+_(aten, miopen_batch_norm_backward) \
+_(aten, miopen_convolution) \
+_(aten, miopen_convolution_add_relu) \
+_(aten, miopen_convolution_relu) \
+_(aten, miopen_convolution_transpose) \
+_(aten, miopen_depthwise_convolution) \
+_(aten, miopen_rnn) \
+_(aten, miopen_rnn_backward) \
+_(aten, mish) \
+_(aten, mish_) \
+_(aten, mish_backward) \
+_(aten, mkldnn_adaptive_avg_pool2d) \
+_(aten, mkldnn_adaptive_avg_pool2d_backward) \
+_(aten, mkldnn_convolution) \
+_(aten, mkldnn_linear) \
+_(aten, mkldnn_linear_backward) \
+_(aten, mkldnn_linear_backward_input) \
+_(aten, mkldnn_linear_backward_weights) \
+_(aten, mkldnn_max_pool2d) \
+_(aten, mkldnn_max_pool2d_backward) \
+_(aten, mkldnn_max_pool3d) \
+_(aten, mkldnn_max_pool3d_backward) \
+_(aten, mkldnn_reorder_conv2d_weight) \
+_(aten, mkldnn_reorder_conv3d_weight) \
+_(aten, mkldnn_rnn_layer) \
+_(aten, mkldnn_rnn_layer_backward) \
+_(aten, mm) \
+_(aten, mode) \
+_(aten, moveaxis) \
+_(aten, movedim) \
+_(aten, mps_convolution_backward) \
+_(aten, mps_convolution_transpose_backward) \
+_(aten, mse_loss) \
+_(aten, mse_loss_backward) \
+_(aten, msort) \
+_(aten, mul) \
+_(aten, mul_) \
+_(aten, multi_margin_loss) \
+_(aten, multi_margin_loss_backward) \
+_(aten, multilabel_margin_loss) \
+_(aten, multilabel_margin_loss_backward) \
+_(aten, multilabel_margin_loss_forward) \
+_(aten, multinomial) \
+_(aten, multiply) \
+_(aten, multiply_) \
+_(aten, mv) \
+_(aten, mvlgamma) \
+_(aten, mvlgamma_) \
+_(aten, nan_to_num) \
+_(aten, nan_to_num_) \
+_(aten, nanmean) \
+_(aten, nanmedian) \
+_(aten, nanquantile) \
+_(aten, nansum) \
+_(aten, narrow) \
+_(aten, narrow_copy) \
+_(aten, native_batch_norm) \
+_(aten, native_batch_norm_backward) \
+_(aten, native_channel_shuffle) \
+_(aten, native_dropout) \
+_(aten, native_dropout_backward) \
+_(aten, native_group_norm) \
+_(aten, native_group_norm_backward) \
+_(aten, native_layer_norm) \
+_(aten, native_layer_norm_backward) \
+_(aten, native_norm) \
+_(aten, ne) \
+_(aten, ne_) \
+_(aten, neg) \
+_(aten, neg_) \
+_(aten, negative) \
+_(aten, negative_) \
+_(aten, nested_to_padded_tensor) \
+_(aten, new_empty) \
+_(aten, new_empty_strided) \
+_(aten, new_full) \
+_(aten, new_ones) \
+_(aten, new_zeros) \
+_(aten, nextafter) \
+_(aten, nextafter_) \
+_(aten, nll_loss) \
+_(aten, nll_loss2d) \
+_(aten, nll_loss2d_backward) \
+_(aten, nll_loss2d_forward) \
+_(aten, nll_loss_backward) \
+_(aten, nll_loss_forward) \
+_(aten, nll_loss_nd) \
+_(aten, nonzero) \
+_(aten, nonzero_numpy) \
+_(aten, nonzero_static) \
+_(aten, norm) \
+_(aten, norm_except_dim) \
+_(aten, normal) \
+_(aten, normal_) \
+_(aten, normal_functional) \
+_(aten, not_equal) \
+_(aten, not_equal_) \
+_(aten, nuclear_norm) \
+_(aten, numpy_T) \
+_(aten, one_hot) \
+_(aten, ones) \
+_(aten, ones_like) \
+_(aten, orgqr) \
+_(aten, ormqr) \
+_(aten, outer) \
+_(aten, output_nr) \
+_(aten, pad) \
+_(aten, pad_sequence) \
+_(aten, pairwise_distance) \
+_(aten, pdist) \
+_(aten, permute) \
+_(aten, permute_copy) \
+_(aten, pin_memory) \
+_(aten, pinverse) \
+_(aten, pixel_shuffle) \
+_(aten, pixel_unshuffle) \
+_(aten, poisson) \
+_(aten, poisson_nll_loss) \
+_(aten, polar) \
+_(aten, polygamma) \
+_(aten, polygamma_) \
+_(aten, positive) \
+_(aten, pow) \
+_(aten, pow_) \
+_(aten, prelu) \
+_(aten, prod) \
+_(aten, promote_types) \
+_(aten, put) \
+_(aten, put_) \
+_(aten, q_per_channel_axis) \
+_(aten, q_per_channel_scales) \
+_(aten, q_per_channel_zero_points) \
+_(aten, q_scale) \
+_(aten, q_zero_point) \
+_(aten, qr) \
+_(aten, qscheme) \
+_(aten, quantile) \
+_(aten, quantize_per_channel) \
+_(aten, quantize_per_tensor) \
+_(aten, quantize_per_tensor_dynamic) \
+_(aten, quantized_batch_norm) \
+_(aten, quantized_gru_cell) \
+_(aten, quantized_lstm_cell) \
+_(aten, quantized_max_pool1d) \
+_(aten, quantized_max_pool2d) \
+_(aten, quantized_max_pool3d) \
+_(aten, quantized_rnn_relu_cell) \
+_(aten, quantized_rnn_tanh_cell) \
+_(aten, rad2deg) \
+_(aten, rad2deg_) \
+_(aten, rand) \
+_(aten, rand_like) \
+_(aten, randint) \
+_(aten, randint_like) \
+_(aten, randn) \
+_(aten, randn_like) \
+_(aten, random) \
+_(aten, random_) \
+_(aten, randperm) \
+_(aten, range) \
+_(aten, ravel) \
+_(aten, real) \
+_(aten, reciprocal) \
+_(aten, reciprocal_) \
+_(aten, record_stream) \
+_(aten, refine_names) \
+_(aten, reflection_pad1d) \
+_(aten, reflection_pad1d_backward) \
+_(aten, reflection_pad2d) \
+_(aten, reflection_pad2d_backward) \
+_(aten, reflection_pad3d) \
+_(aten, reflection_pad3d_backward) \
+_(aten, relu) \
+_(aten, relu6) \
+_(aten, relu6_) \
+_(aten, relu_) \
+_(aten, remainder) \
+_(aten, remainder_) \
+_(aten, rename) \
+_(aten, rename_) \
+_(aten, renorm) \
+_(aten, renorm_) \
+_(aten, repeat) \
+_(aten, repeat_interleave) \
+_(aten, replication_pad1d) \
+_(aten, replication_pad1d_backward) \
+_(aten, replication_pad2d) \
+_(aten, replication_pad2d_backward) \
+_(aten, replication_pad3d) \
+_(aten, replication_pad3d_backward) \
+_(aten, requires_grad) \
+_(aten, requires_grad_) \
+_(aten, reshape) \
+_(aten, reshape_as) \
+_(aten, resize) \
+_(aten, resize_) \
+_(aten, resize_as) \
+_(aten, resize_as_) \
+_(aten, resize_as_sparse) \
+_(aten, resize_as_sparse_) \
+_(aten, resolve_conj) \
+_(aten, resolve_neg) \
+_(aten, result_type) \
+_(aten, retain_grad) \
+_(aten, retains_grad) \
+_(aten, rms_norm) \
+_(aten, rnn_relu) \
+_(aten, rnn_relu_cell) \
+_(aten, rnn_tanh) \
+_(aten, rnn_tanh_cell) \
+_(aten, roll) \
+_(aten, rot90) \
+_(aten, round) \
+_(aten, round_) \
+_(aten, row_indices) \
+_(aten, row_indices_copy) \
+_(aten, row_stack) \
+_(aten, rrelu) \
+_(aten, rrelu_) \
+_(aten, rrelu_with_noise) \
+_(aten, rrelu_with_noise_) \
+_(aten, rrelu_with_noise_backward) \
+_(aten, rshift) \
+_(aten, rsqrt) \
+_(aten, rsqrt_) \
+_(aten, rsub) \
+_(aten, scalar_tensor) \
+_(aten, scaled_dot_product_attention) \
+_(aten, scatter) \
+_(aten, scatter_) \
+_(aten, scatter_add) \
+_(aten, scatter_add_) \
+_(aten, scatter_reduce) \
+_(aten, scatter_reduce_) \
+_(aten, searchsorted) \
+_(aten, segment_reduce) \
+_(aten, select) \
+_(aten, select_backward) \
+_(aten, select_copy) \
+_(aten, select_scatter) \
+_(aten, selu) \
+_(aten, selu_) \
+_(aten, set) \
+_(aten, set_) \
+_(aten, set_data) \
+_(aten, sgn) \
+_(aten, sgn_) \
+_(aten, sigmoid) \
+_(aten, sigmoid_) \
+_(aten, sigmoid_backward) \
+_(aten, sign) \
+_(aten, sign_) \
+_(aten, signbit) \
+_(aten, silu) \
+_(aten, silu_) \
+_(aten, silu_backward) \
+_(aten, sin) \
+_(aten, sin_) \
+_(aten, sinc) \
+_(aten, sinc_) \
+_(aten, sinh) \
+_(aten, sinh_) \
+_(aten, size) \
+_(aten, slice) \
+_(aten, slice_backward) \
+_(aten, slice_copy) \
+_(aten, slice_inverse) \
+_(aten, slice_scatter) \
+_(aten, slogdet) \
+_(aten, slow_conv3d) \
+_(aten, slow_conv3d_forward) \
+_(aten, slow_conv_dilated2d) \
+_(aten, slow_conv_dilated3d) \
+_(aten, slow_conv_transpose2d) \
+_(aten, slow_conv_transpose3d) \
+_(aten, smm) \
+_(aten, smooth_l1_loss) \
+_(aten, smooth_l1_loss_backward) \
+_(aten, soft_margin_loss) \
+_(aten, soft_margin_loss_backward) \
+_(aten, softmax) \
+_(aten, softplus) \
+_(aten, softplus_backward) \
+_(aten, softshrink) \
+_(aten, softshrink_backward) \
+_(aten, sort) \
+_(aten, sparse_bsc_tensor) \
+_(aten, sparse_bsr_tensor) \
+_(aten, sparse_compressed_tensor) \
+_(aten, sparse_coo_tensor) \
+_(aten, sparse_csc_tensor) \
+_(aten, sparse_csr_tensor) \
+_(aten, sparse_dim) \
+_(aten, sparse_mask) \
+_(aten, sparse_resize) \
+_(aten, sparse_resize_) \
+_(aten, sparse_resize_and_clear) \
+_(aten, sparse_resize_and_clear_) \
+_(aten, sparse_sampled_addmm) \
+_(aten, special_airy_ai) \
+_(aten, special_bessel_j0) \
+_(aten, special_bessel_j1) \
+_(aten, special_bessel_y0) \
+_(aten, special_bessel_y1) \
+_(aten, special_chebyshev_polynomial_t) \
+_(aten, special_chebyshev_polynomial_u) \
+_(aten, special_chebyshev_polynomial_v) \
+_(aten, special_chebyshev_polynomial_w) \
+_(aten, special_digamma) \
+_(aten, special_entr) \
+_(aten, special_erf) \
+_(aten, special_erfc) \
+_(aten, special_erfcx) \
+_(aten, special_erfinv) \
+_(aten, special_exp2) \
+_(aten, special_expit) \
+_(aten, special_expm1) \
+_(aten, special_gammainc) \
+_(aten, special_gammaincc) \
+_(aten, special_gammaln) \
+_(aten, special_hermite_polynomial_h) \
+_(aten, special_hermite_polynomial_he) \
+_(aten, special_i0) \
+_(aten, special_i0e) \
+_(aten, special_i1) \
+_(aten, special_i1e) \
+_(aten, special_laguerre_polynomial_l) \
+_(aten, special_legendre_polynomial_p) \
+_(aten, special_log1p) \
+_(aten, special_log_ndtr) \
+_(aten, special_log_softmax) \
+_(aten, special_logit) \
+_(aten, special_logsumexp) \
+_(aten, special_modified_bessel_i0) \
+_(aten, special_modified_bessel_i1) \
+_(aten, special_modified_bessel_k0) \
+_(aten, special_modified_bessel_k1) \
+_(aten, special_multigammaln) \
+_(aten, special_ndtr) \
+_(aten, special_ndtri) \
+_(aten, special_polygamma) \
+_(aten, special_psi) \
+_(aten, special_round) \
+_(aten, special_scaled_modified_bessel_k0) \
+_(aten, special_scaled_modified_bessel_k1) \
+_(aten, special_shifted_chebyshev_polynomial_t) \
+_(aten, special_shifted_chebyshev_polynomial_u) \
+_(aten, special_shifted_chebyshev_polynomial_v) \
+_(aten, special_shifted_chebyshev_polynomial_w) \
+_(aten, special_sinc) \
+_(aten, special_softmax) \
+_(aten, special_spherical_bessel_j0) \
+_(aten, special_xlog1py) \
+_(aten, special_xlogy) \
+_(aten, special_zeta) \
+_(aten, split) \
+_(aten, split_copy) \
+_(aten, split_with_sizes) \
+_(aten, split_with_sizes_copy) \
+_(aten, sqrt) \
+_(aten, sqrt_) \
+_(aten, square) \
+_(aten, square_) \
+_(aten, squeeze) \
+_(aten, squeeze_) \
+_(aten, squeeze_copy) \
+_(aten, sspaddmm) \
+_(aten, stack) \
+_(aten, std) \
+_(aten, std_mean) \
+_(aten, stft) \
+_(aten, stride) \
+_(aten, sub) \
+_(aten, sub_) \
+_(aten, subtract) \
+_(aten, subtract_) \
+_(aten, sum) \
+_(aten, sum_to_size) \
+_(aten, svd) \
+_(aten, swapaxes) \
+_(aten, swapaxes_) \
+_(aten, swapdims) \
+_(aten, swapdims_) \
+_(aten, sym_constrain_range) \
+_(aten, sym_constrain_range_for_size) \
+_(aten, sym_numel) \
+_(aten, sym_size) \
+_(aten, sym_storage_offset) \
+_(aten, sym_stride) \
+_(aten, t) \
+_(aten, t_) \
+_(aten, t_copy) \
+_(aten, take) \
+_(aten, take_along_dim) \
+_(aten, tan) \
+_(aten, tan_) \
+_(aten, tanh) \
+_(aten, tanh_) \
+_(aten, tanh_backward) \
+_(aten, tensor_split) \
+_(aten, tensordot) \
+_(aten, thnn_conv2d) \
+_(aten, threshold) \
+_(aten, threshold_) \
+_(aten, threshold_backward) \
+_(aten, tile) \
+_(aten, to) \
+_(aten, to_dense) \
+_(aten, to_dense_backward) \
+_(aten, to_mkldnn) \
+_(aten, to_mkldnn_backward) \
+_(aten, to_padded_tensor) \
+_(aten, to_sparse) \
+_(aten, to_sparse_bsc) \
+_(aten, to_sparse_bsr) \
+_(aten, to_sparse_csc) \
+_(aten, to_sparse_csr) \
+_(aten, topk) \
+_(aten, trace) \
+_(aten, trace_backward) \
+_(aten, transpose) \
+_(aten, transpose_) \
+_(aten, transpose_copy) \
+_(aten, trapezoid) \
+_(aten, trapz) \
+_(aten, triangular_solve) \
+_(aten, tril) \
+_(aten, tril_) \
+_(aten, tril_indices) \
+_(aten, triplet_margin_loss) \
+_(aten, triu) \
+_(aten, triu_) \
+_(aten, triu_indices) \
+_(aten, true_divide) \
+_(aten, true_divide_) \
+_(aten, trunc) \
+_(aten, trunc_) \
+_(aten, type_as) \
+_(aten, unbind) \
+_(aten, unbind_copy) \
+_(aten, unflatten) \
+_(aten, unflatten_dense_tensors) \
+_(aten, unfold) \
+_(aten, unfold_backward) \
+_(aten, unfold_copy) \
+_(aten, uniform) \
+_(aten, uniform_) \
+_(aten, unique_consecutive) \
+_(aten, unique_dim) \
+_(aten, unique_dim_consecutive) \
+_(aten, unsafe_chunk) \
+_(aten, unsafe_split) \
+_(aten, unsafe_split_with_sizes) \
+_(aten, unsqueeze) \
+_(aten, unsqueeze_) \
+_(aten, unsqueeze_copy) \
+_(aten, upsample_bicubic2d) \
+_(aten, upsample_bicubic2d_backward) \
+_(aten, upsample_bilinear2d) \
+_(aten, upsample_bilinear2d_backward) \
+_(aten, upsample_linear1d) \
+_(aten, upsample_linear1d_backward) \
+_(aten, upsample_nearest1d) \
+_(aten, upsample_nearest1d_backward) \
+_(aten, upsample_nearest2d) \
+_(aten, upsample_nearest2d_backward) \
+_(aten, upsample_nearest3d) \
+_(aten, upsample_nearest3d_backward) \
+_(aten, upsample_trilinear3d) \
+_(aten, upsample_trilinear3d_backward) \
+_(aten, value_selecting_reduction_backward) \
+_(aten, values) \
+_(aten, values_copy) \
+_(aten, vander) \
+_(aten, var) \
+_(aten, var_mean) \
+_(aten, vdot) \
+_(aten, view) \
+_(aten, view_as) \
+_(aten, view_as_complex) \
+_(aten, view_as_complex_copy) \
+_(aten, view_as_real) \
+_(aten, view_as_real_copy) \
+_(aten, view_copy) \
+_(aten, vsplit) \
+_(aten, vstack) \
+_(aten, where) \
+_(aten, xlogy) \
+_(aten, xlogy_) \
+_(aten, zero) \
+_(aten, zero_) \
+_(aten, zeros) \
+_(aten, zeros_like)
+
+#define FORALL_ATTR_BASE_SYMBOLS(_) \
+_(attr, A) \
+_(attr, B) \
+_(attr, C) \
+_(attr, H) \
+_(attr, HxW) \
+_(attr, K) \
+_(attr, L) \
+_(attr, LD) \
+_(attr, LU) \
+_(attr, LU_data) \
+_(attr, LU_pivots) \
+_(attr, M) \
+_(attr, N) \
+_(attr, P) \
+_(attr, Q) \
+_(attr, R) \
+_(attr, S) \
+_(attr, U) \
+_(attr, UPLO) \
+_(attr, V) \
+_(attr, Vh) \
+_(attr, W) \
+_(attr, X) \
+_(attr, a) \
+_(attr, abs) \
+_(attr, accumulate) \
+_(attr, accumulate_matches) \
+_(attr, activation) \
+_(attr, addends) \
+_(attr, adjoint) \
+_(attr, alg_id) \
+_(attr, algorithm) \
+_(attr, alibi_slopes) \
+_(attr, align_corners) \
+_(attr, allow_tf32) \
+_(attr, alpha) \
+_(attr, amsgrad) \
+_(attr, anchor) \
+_(attr, angle) \
+_(attr, any) \
+_(attr, api_name) \
+_(attr, append) \
+_(attr, approximate) \
+_(attr, arg1) \
+_(attr, arg2) \
+_(attr, arg3) \
+_(attr, arg_out) \
+_(attr, assert_msg) \
+_(attr, assume_unique) \
+_(attr, atol) \
+_(attr, attn_bias) \
+_(attr, attn_mask) \
+_(attr, average_attn_weights) \
+_(attr, averaging_const) \
+_(attr, aweights) \
+_(attr, axis) \
+_(attr, axis0) \
+_(attr, axis1) \
+_(attr, b) \
+_(attr, b_hh) \
+_(attr, b_ih) \
+_(attr, bag_size) \
+_(attr, base) \
+_(attr, batch1) \
+_(attr, batch2) \
+_(attr, batch_dim) \
+_(attr, batch_first) \
+_(attr, batch_size) \
+_(attr, batch_sizes) \
+_(attr, benchmark) \
+_(attr, beta) \
+_(attr, beta1) \
+_(attr, beta2) \
+_(attr, bias) \
+_(attr, bias_defined) \
+_(attr, bias_g) \
+_(attr, bias_requires_grad) \
+_(attr, bias_sizes) \
+_(attr, bidirectional) \
+_(attr, bin_edges) \
+_(attr, bins) \
+_(attr, bit_width) \
+_(attr, blank) \
+_(attr, blocksize) \
+_(attr, boundaries) \
+_(attr, buffer) \
+_(attr, ccol_indices) \
+_(attr, cdim) \
+_(attr, cdist) \
+_(attr, ceil_mode) \
+_(attr, cell_state_fwd) \
+_(attr, center) \
+_(attr, ch_axis) \
+_(attr, check_errors) \
+_(attr, chunks) \
+_(attr, coalesced) \
+_(attr, coefficients) \
+_(attr, col) \
+_(attr, col_indices) \
+_(attr, col_offsets) \
+_(attr, col_offsets_hh) \
+_(attr, col_offsets_ih) \
+_(attr, compressed_A) \
+_(attr, compressed_idx) \
+_(attr, compressed_indices) \
+_(attr, compressed_indices_dtype) \
+_(attr, compute_log_sumexp) \
+_(attr, compute_mode) \
+_(attr, compute_uv) \
+_(attr, compute_v) \
+_(attr, condition) \
+_(attr, copy) \
+_(attr, correction) \
+_(attr, count) \
+_(attr, count_include_pad) \
+_(attr, counts) \
+_(attr, cpu_dtype) \
+_(attr, cpu_enabled) \
+_(attr, cpu_nested_shape_example) \
+_(attr, create_graph) \
+_(attr, crow_indices) \
+_(attr, cu_seqlens_k) \
+_(attr, cu_seqlens_q) \
+_(attr, cuda_dtype) \
+_(attr, cuda_enabled) \
+_(attr, cudnn_enable) \
+_(attr, cudnn_enabled) \
+_(attr, cum_seq_k) \
+_(attr, cum_seq_q) \
+_(attr, custom_mask_type) \
+_(attr, cx) \
+_(attr, cx_) \
+_(attr, cx_tmp) \
+_(attr, cy) \
+_(attr, cy_) \
+_(attr, d) \
+_(attr, dampening) \
+_(attr, data) \
+_(attr, decimals) \
+_(attr, delta) \
+_(attr, dense) \
+_(attr, dense_B) \
+_(attr, dense_dim) \
+_(attr, density) \
+_(attr, dep_token) \
+_(attr, descending) \
+_(attr, destination) \
+_(attr, deterministic) \
+_(attr, device) \
+_(attr, device_index) \
+_(attr, dgrad_glu) \
+_(attr, diagonal) \
+_(attr, diagonals) \
+_(attr, dilation) \
+_(attr, dim) \
+_(attr, dim0) \
+_(attr, dim1) \
+_(attr, dim2) \
+_(attr, dimension) \
+_(attr, dims) \
+_(attr, dims_other) \
+_(attr, dims_self) \
+_(attr, divisor_override) \
+_(attr, downscale_factor) \
+_(attr, driver) \
+_(attr, dropout) \
+_(attr, dropout_mask) \
+_(attr, dropout_p) \
+_(attr, dropout_seed) \
+_(attr, dropout_state) \
+_(attr, dst) \
+_(attr, dtype) \
+_(attr, dual) \
+_(attr, dummy) \
+_(attr, dx) \
+_(attr, edge_order) \
+_(attr, eigenvalues) \
+_(attr, eigenvectors) \
+_(attr, eigvals) \
+_(attr, eigvecs) \
+_(attr, element) \
+_(attr, elements) \
+_(attr, ellipsis_idx) \
+_(attr, embed_dim) \
+_(attr, enable_gqa) \
+_(attr, end) \
+_(attr, end_dim) \
+_(attr, eps) \
+_(attr, epsilon) \
+_(attr, equal_nan) \
+_(attr, equation) \
+_(attr, exp_avg_sqs) \
+_(attr, exp_avgs) \
+_(attr, expand1) \
+_(attr, expand2) \
+_(attr, expand3) \
+_(attr, exponent) \
+_(attr, exponential_average_factor) \
+_(attr, fake_quant_enabled) \
+_(attr, fake_quant_on) \
+_(attr, ffn_bias_1) \
+_(attr, ffn_bias_2) \
+_(attr, ffn_weight_1) \
+_(attr, ffn_weight_2) \
+_(attr, filename) \
+_(attr, fill) \
+_(attr, fill_value) \
+_(attr, flat) \
+_(attr, forward) \
+_(attr, found_inf) \
+_(attr, from) \
+_(attr, from_) \
+_(attr, full) \
+_(attr, full_matrices) \
+_(attr, fuse_transform_0213) \
+_(attr, fweights) \
+_(attr, g) \
+_(attr, gO) \
+_(attr, generator) \
+_(attr, ggI) \
+_(attr, ggW) \
+_(attr, ggb) \
+_(attr, glu) \
+_(attr, grad) \
+_(attr, grad_bias) \
+_(attr, grad_cy) \
+_(attr, grad_factor) \
+_(attr, grad_glu) \
+_(attr, grad_hy) \
+_(attr, grad_in) \
+_(attr, grad_input) \
+_(attr, grad_input_mask) \
+_(attr, grad_out) \
+_(attr, grad_out_) \
+_(attr, grad_output) \
+_(attr, grad_scale) \
+_(attr, grad_w) \
+_(attr, grad_weight) \
+_(attr, grad_x) \
+_(attr, grad_y) \
+_(attr, gradient) \
+_(attr, grads) \
+_(attr, grid) \
+_(attr, group) \
+_(attr, groups) \
+_(attr, growth_interval) \
+_(attr, growth_tracker) \
+_(attr, half_to_float) \
+_(attr, has_bias) \
+_(attr, has_biases) \
+_(attr, hermitian) \
+_(attr, hidden_bias) \
+_(attr, hidden_gates) \
+_(attr, hidden_size) \
+_(attr, high) \
+_(attr, hist) \
+_(attr, hop_length) \
+_(attr, hx) \
+_(attr, hx_) \
+_(attr, hy_) \
+_(attr, i1) \
+_(attr, i2) \
+_(attr, i3) \
+_(attr, ignore_index) \
+_(attr, imag) \
+_(attr, impl_index) \
+_(attr, implicit) \
+_(attr, include_last_offset) \
+_(attr, include_self) \
+_(attr, increasing) \
+_(attr, ind) \
+_(attr, index) \
+_(attr, index_dtype) \
+_(attr, indexing) \
+_(attr, indices) \
+_(attr, info) \
+_(attr, initial) \
+_(attr, innerKTiles) \
+_(attr, input) \
+_(attr, input1) \
+_(attr, input2) \
+_(attr, input3) \
+_(attr, input_bias) \
+_(attr, input_dtype) \
+_(attr, input_g) \
+_(attr, input_gates) \
+_(attr, input_lengths) \
+_(attr, input_scale) \
+_(attr, input_size) \
+_(attr, input_sizes) \
+_(attr, input_zero_point) \
+_(attr, inputs) \
+_(attr, interpolation) \
+_(attr, interpolation_mode) \
+_(attr, inv_scale) \
+_(attr, inverse) \
+_(attr, invert) \
+_(attr, invstd) \
+_(attr, is_causal) \
+_(attr, is_coalesced) \
+_(attr, is_crow) \
+_(attr, is_first_step) \
+_(attr, is_matrix) \
+_(attr, is_result) \
+_(attr, is_target) \
+_(attr, k) \
+_(attr, keepdim) \
+_(attr, kernel_size) \
+_(attr, key) \
+_(attr, label_smoothing) \
+_(attr, lambd) \
+_(attr, largest) \
+_(attr, last_dim_size) \
+_(attr, layersOutputs) \
+_(attr, layout) \
+_(attr, left) \
+_(attr, length) \
+_(attr, lengths) \
+_(attr, level) \
+_(attr, like) \
+_(attr, list) \
+_(attr, log_alpha) \
+_(attr, log_input) \
+_(attr, log_probs) \
+_(attr, log_target) \
+_(attr, logabsdet) \
+_(attr, logsumexp) \
+_(attr, low) \
+_(attr, lower) \
+_(attr, lr) \
+_(attr, lr_decay) \
+_(attr, ltm) \
+_(attr, m) \
+_(attr, mantissa) \
+_(attr, margin) \
+_(attr, mask) \
+_(attr, mask_check) \
+_(attr, mask_type) \
+_(attr, masked_grad) \
+_(attr, mat) \
+_(attr, mat1) \
+_(attr, mat1_meta) \
+_(attr, mat2) \
+_(attr, matrices) \
+_(attr, max) \
+_(attr, max_exp_avg_sqs) \
+_(attr, max_k) \
+_(attr, max_lengths) \
+_(attr, max_norm) \
+_(attr, max_q) \
+_(attr, max_seqlen) \
+_(attr, max_seqlen_k) \
+_(attr, max_seqlen_q) \
+_(attr, max_size) \
+_(attr, max_val) \
+_(attr, max_values) \
+_(attr, maximize) \
+_(attr, maximum_indices) \
+_(attr, maxnorm) \
+_(attr, mean) \
+_(attr, median) \
+_(attr, memory_format) \
+_(attr, meta) \
+_(attr, min) \
+_(attr, min_indices) \
+_(attr, min_seqlen) \
+_(attr, min_val) \
+_(attr, minlength) \
+_(attr, mode) \
+_(attr, momentum) \
+_(attr, momentum_buffer_list) \
+_(attr, n) \
+_(attr, n_bins) \
+_(attr, n_fft) \
+_(attr, names) \
+_(attr, nan) \
+_(attr, need_weights) \
+_(attr, neg_log_likelihood) \
+_(attr, negative) \
+_(attr, negative_slope) \
+_(attr, neginf) \
+_(attr, nested_size) \
+_(attr, nested_strides) \
+_(attr, nesterov) \
+_(attr, new_data) \
+_(attr, nnz) \
+_(attr, noise) \
+_(attr, non_blocking) \
+_(attr, norm) \
+_(attr, norm_bias_1) \
+_(attr, norm_bias_2) \
+_(attr, norm_first) \
+_(attr, norm_type) \
+_(attr, norm_weight_1) \
+_(attr, norm_weight_2) \
+_(attr, normalization) \
+_(attr, normalized) \
+_(attr, normalized_shape) \
+_(attr, nt_example) \
+_(attr, num_chunks) \
+_(attr, num_classes) \
+_(attr, num_generated) \
+_(attr, num_groups) \
+_(attr, num_head) \
+_(attr, num_heads) \
+_(attr, num_layers) \
+_(attr, num_parallel) \
+_(attr, num_samples) \
+_(attr, num_splits_key) \
+_(attr, num_weights) \
+_(attr, numel) \
+_(attr, observer_on) \
+_(attr, offset) \
+_(attr, offset2bag) \
+_(attr, offsets) \
+_(attr, onesided) \
+_(attr, ord) \
+_(attr, order) \
+_(attr, other) \
+_(attr, out) \
+_(attr, out0) \
+_(attr, out1) \
+_(attr, out2) \
+_(attr, out3) \
+_(attr, out4) \
+_(attr, out5) \
+_(attr, out6) \
+_(attr, out_channel) \
+_(attr, out_dim) \
+_(attr, out_dtype) \
+_(attr, out_int32) \
+_(attr, outdim) \
+_(attr, output) \
+_(attr, output_mask) \
+_(attr, output_padding) \
+_(attr, output_scale) \
+_(attr, output_size) \
+_(attr, output_zero_point) \
+_(attr, p) \
+_(attr, packed) \
+_(attr, packed_hh) \
+_(attr, packed_ih) \
+_(attr, packed_weight) \
+_(attr, pad) \
+_(attr, pad_mode) \
+_(attr, padded) \
+_(attr, padding) \
+_(attr, padding_idx) \
+_(attr, padding_mode) \
+_(attr, padding_side) \
+_(attr, padding_value) \
+_(attr, params) \
+_(attr, path) \
+_(attr, pdist) \
+_(attr, per_row_fake_quant) \
+_(attr, per_sample_weights) \
+_(attr, periodic) \
+_(attr, philox_offset) \
+_(attr, philox_seed) \
+_(attr, physical_layout) \
+_(attr, pin_memory) \
+_(attr, pivot) \
+_(attr, pivots) \
+_(attr, plain_idx) \
+_(attr, plain_indices) \
+_(attr, pos_weight) \
+_(attr, posinf) \
+_(attr, positive) \
+_(attr, pow) \
+_(attr, prepend) \
+_(attr, primal) \
+_(attr, prob) \
+_(attr, proj_bias) \
+_(attr, proj_size) \
+_(attr, proj_weight) \
+_(attr, q) \
+_(attr, qGroupSize) \
+_(attr, qScaleAndZeros) \
+_(attr, qkv) \
+_(attr, qkv_bias) \
+_(attr, qkv_weight) \
+_(attr, qtensor) \
+_(attr, quant_max) \
+_(attr, quant_min) \
+_(attr, quasi) \
+_(attr, query) \
+_(attr, r) \
+_(attr, ragged_idx) \
+_(attr, random_samples) \
+_(attr, range) \
+_(attr, rank) \
+_(attr, ratio) \
+_(attr, rcond) \
+_(attr, real) \
+_(attr, reduce) \
+_(attr, reduce_range) \
+_(attr, reduction) \
+_(attr, repeats) \
+_(attr, replacement) \
+_(attr, requires_grad) \
+_(attr, reserve) \
+_(attr, reserveSpace) \
+_(attr, reservedSpace) \
+_(attr, residuals) \
+_(attr, result) \
+_(attr, retain_graph) \
+_(attr, return_complex) \
+_(attr, return_counts) \
+_(attr, return_debug_mask) \
+_(attr, return_inverse) \
+_(attr, reverse) \
+_(attr, right) \
+_(attr, rounding_mode) \
+_(attr, row) \
+_(attr, row_indices) \
+_(attr, rstd) \
+_(attr, rtol) \
+_(attr, running_max) \
+_(attr, running_mean) \
+_(attr, running_min) \
+_(attr, running_var) \
+_(attr, s) \
+_(attr, save_invstd) \
+_(attr, save_mean) \
+_(attr, save_var) \
+_(attr, save_var_transform) \
+_(attr, saved_g) \
+_(attr, saved_norms) \
+_(attr, saved_v) \
+_(attr, scalar) \
+_(attr, scalar1) \
+_(attr, scalar2) \
+_(attr, scalars) \
+_(attr, scale) \
+_(attr, scale_a) \
+_(attr, scale_b) \
+_(attr, scale_backoff_factor) \
+_(attr, scale_factors) \
+_(attr, scale_grad_by_freq) \
+_(attr, scale_growth_factor) \
+_(attr, scale_hh) \
+_(attr, scale_ih) \
+_(attr, scale_result) \
+_(attr, scales) \
+_(attr, scales_d) \
+_(attr, scales_h) \
+_(attr, scales_w) \
+_(attr, sections) \
+_(attr, seed) \
+_(attr, self) \
+_(attr, self_is_result) \
+_(attr, self_num_batch_dims) \
+_(attr, self_or_result) \
+_(attr, self_sizes) \
+_(attr, seqlen_k) \
+_(attr, sequences) \
+_(attr, seqused_k) \
+_(attr, shape) \
+_(attr, shared) \
+_(attr, shared_storage_dqdkdv) \
+_(attr, shifts) \
+_(attr, side) \
+_(attr, sigma) \
+_(attr, sign) \
+_(attr, singular_values) \
+_(attr, size) \
+_(attr, sizes) \
+_(attr, skip_first) \
+_(attr, sobolstate) \
+_(attr, solution) \
+_(attr, some) \
+_(attr, sorted) \
+_(attr, sorted_sequence) \
+_(attr, sorter) \
+_(attr, source) \
+_(attr, spacing) \
+_(attr, sparse) \
+_(attr, sparse_dim) \
+_(attr, sparse_grad) \
+_(attr, split_size) \
+_(attr, split_sizes) \
+_(attr, src) \
+_(attr, stable) \
+_(attr, start) \
+_(attr, start_dim) \
+_(attr, state_steps) \
+_(attr, state_sums) \
+_(attr, std) \
+_(attr, step) \
+_(attr, steps) \
+_(attr, storage_offset) \
+_(attr, stride) \
+_(attr, sum_dy) \
+_(attr, sum_dy_xmu) \
+_(attr, sumdim) \
+_(attr, swap) \
+_(attr, symmetric_quant) \
+_(attr, t) \
+_(attr, tangent) \
+_(attr, target) \
+_(attr, target_lengths) \
+_(attr, targets) \
+_(attr, tau) \
+_(attr, tensor) \
+_(attr, tensor1) \
+_(attr, tensor2) \
+_(attr, tensor_indices_or_sections) \
+_(attr, tensors) \
+_(attr, tensors1) \
+_(attr, test_element) \
+_(attr, test_elements) \
+_(attr, the_template) \
+_(attr, theta) \
+_(attr, thread_masks) \
+_(attr, threshold) \
+_(attr, to) \
+_(attr, tol) \
+_(attr, total) \
+_(attr, total_L) \
+_(attr, total_length) \
+_(attr, total_weight) \
+_(attr, train) \
+_(attr, training) \
+_(attr, transpose) \
+_(attr, transpose_result) \
+_(attr, transposed) \
+_(attr, type1) \
+_(attr, type2) \
+_(attr, unbiased) \
+_(attr, unitriangular) \
+_(attr, unpack_data) \
+_(attr, unpack_pivots) \
+_(attr, unroll_dim) \
+_(attr, unsafe) \
+_(attr, update) \
+_(attr, upper) \
+_(attr, upscale_factor) \
+_(attr, use_cutlass) \
+_(attr, use_fast_accum) \
+_(attr, use_gelu) \
+_(attr, use_input_stats) \
+_(attr, v) \
+_(attr, value) \
+_(attr, values) \
+_(attr, var) \
+_(attr, vec) \
+_(attr, vec1) \
+_(attr, vec2) \
+_(attr, w_hh) \
+_(attr, w_ih) \
+_(attr, weight) \
+_(attr, weight0) \
+_(attr, weight1) \
+_(attr, weight2) \
+_(attr, weight3) \
+_(attr, weight4) \
+_(attr, weight_arr) \
+_(attr, weight_buf) \
+_(attr, weight_decay) \
+_(attr, weight_g) \
+_(attr, weight_scale) \
+_(attr, weight_stride0) \
+_(attr, weight_zero_point) \
+_(attr, weights) \
+_(attr, win_length) \
+_(attr, window) \
+_(attr, window_length) \
+_(attr, window_size) \
+_(attr, window_size_left) \
+_(attr, window_size_right) \
+_(attr, with_replacement) \
+_(attr, workspace) \
+_(attr, wrap) \
+_(attr, x) \
+_(attr, x1) \
+_(attr, x2) \
+_(attr, y) \
+_(attr, z) \
+_(attr, z_state) \
+_(attr, zero_infinity) \
+_(attr, zero_point) \
+_(attr, zero_point_hh) \
+_(attr, zero_point_ih) \
+_(attr, zero_points)
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/blob.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/blob.h
new file mode 100644
index 0000000000000000000000000000000000000000..35ee3b358c9919d834bb4e0c6dfea592321f8402
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/blob.h
@@ -0,0 +1,204 @@
+#pragma once
+
+#include <type_traits>
+
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/typeid.h>
+#include <c10/macros/Macros.h>
+
+namespace caffe2 {
+
+class Tensor;
+
+/**
+ * @brief Blob is a general container that hosts a typed pointer.
+ *
+ * A Blob hosts a pointer as well as its type, and takes charge of deleting it
+ * properly when the blob is deallocated or re-allocated with a new type. A blob
+ * could contain anything, although the most common case is to contain a Tensor.
+ */
+class TORCH_API Blob final : public c10::intrusive_ptr_target {
+ public:
+  /** Initializes an empty Blob: null pointer, default TypeMeta, no ownership. */
+  Blob() noexcept : meta_() {}
+  /** Frees the stored object if this Blob owns it. */
+  ~Blob() override {
+    Reset();
+  }
+
+  // Moves are built on swap(): `other` is left empty, and move-assignment
+  // additionally frees (if owned) whatever *this held before.
+  Blob(Blob&& other) noexcept : Blob() {
+    swap(other);
+  }
+
+  Blob& operator=(Blob&& other) noexcept {
+    Blob(std::move(other)).swap(*this);
+    return *this;
+  }
+
+  /** Checks if the content stored in the blob is of type T. */
+  template <class T>
+  bool IsType() const noexcept {
+    return meta_.Match<T>();
+  }
+
+  /**
+   * Returns the meta info of the blob.
+   */
+  const TypeMeta meta() const noexcept {
+    return meta_;
+  }
+
+  /**
+   * Returns a printable typename of the blob.
+   */
+  c10::string_view TypeName() const noexcept {
+    return meta_.name();
+  }
+
+  /**
+   * @brief Gets the const reference of the stored object. The code checks if
+   * the stored object is of the desired type.
+   */
+  // TODO(jerryzh): add a Get(c10::DeviceType) function?
+  template <class T>
+  const T& Get() const {
+    TORCH_INTERNAL_ASSERT(
+        IsType<T>(),
+        "wrong type for the Blob instance. Blob contains ",
+        meta_.name(),
+        " while caller expects ",
+        TypeMeta::TypeName<T>());
+    // TODO: after we add Get<Tensor>(c10::DeviceType)
+    // and changed all the callsites, we can add
+    // a static assert here to enforce T != Tensor
+    return *static_cast<const T*>(pointer_);
+  }
+
+  /**
+   * Returns the stored pointer without any type check; it is nullptr for an
+   * empty Blob.
+   */
+  const void* GetRaw() const noexcept { return pointer_; }
+  void* GetRaw() noexcept { return pointer_; }
+
+  /**
+   * @brief Gets a mutable pointer to the stored object.
+   *
+   * If the current object is not of the right type, a new object is created
+   * and the old object is freed. Note that type T should have a default
+   * constructor. Otherwise, create the object yourself first, and use
+   * Reset().
+   */
+  template <class T>
+  T* GetMutable() {
+    static_assert(
+        std::is_default_constructible<T>::value,
+        "GetMutable can't be called with non-default-constructible types. "
+        "Try using specialized methods");
+    if (IsType<T>()) {
+      return static_cast<T*>(pointer_);
+    }
+    // TODO Re-enable logging
+    // VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<T>();
+    return Reset<T>(new T());
+  }
+
+  /**
+   * Returns the stored object if it is of type T and nullptr otherwise; unlike
+   * GetMutable(), it never creates or replaces the stored object.
+   */
+  template <class T>
+  T* GetMutableOrNull() {
+    return IsType<T>() ? static_cast<T*>(pointer_) : nullptr;
+  }
+
+  /**
+   * Sets the underlying object to the allocated one. The Blob then takes over
+   * the ownership of the passed in pointer. If there is already an object in
+   * the Blob, the old object is freed first, so never pass the pointer the
+   * Blob already owns.
+   *
+   * This is used when the underlying class T does not have a default ctor, or
+   * complex initializations needs to be done outside the blob.
+   */
+  template <class T>
+  T* Reset(T* allocated) {
+    free_();
+    meta_ = TypeMeta::Make<T>();
+    pointer_ = static_cast<void*>(allocated);
+    has_ownership_ = true;
+    return allocated;
+  }
+
+  /**
+   * Sets the underlying object to the allocated one, but does not take over
+   * the ownership of the passed in pointer. If there is already an object in
+   * the Blob, the old object is freed.
+   *
+   * The caller must keep the object alive for as long as this blob may be
+   * accessed, i.e. until another Reset/ShareExternal call or until the blob
+   * is destructed. T may be const-qualified; the qualifier is stripped.
+   */
+  template <class T>
+  std::remove_const_t<T>* ShareExternal(
+      std::remove_const_t<T>* allocated) {
+    return static_cast<std::remove_const_t<T>*>(ShareExternal(
+        static_cast<void*>(allocated),
+        TypeMeta::Make<std::remove_const_t<T>>()));
+  }
+
+  // Type-erased overload: the caller supplies the TypeMeta that describes
+  // `allocated`. Ownership stays with the caller.
+  void* ShareExternal(void* allocated, const TypeMeta meta) {
+    free_();
+    meta_ = meta;
+    pointer_ = allocated;
+    has_ownership_ = false;
+    return allocated;
+  }
+
+  /** Resets the Blob to an empty one, freeing the stored object if owned. */
+  void Reset() {
+    free_();
+    pointer_ = nullptr;
+    meta_ = TypeMeta();
+    has_ownership_ = false;
+  }
+
+  /**
+   * @brief Swaps the underlying storage of two blobs.
+   */
+  void swap(Blob& rhs) noexcept {
+    using std::swap;
+    swap(meta_, rhs.meta_);
+    swap(pointer_, rhs.pointer_);
+    swap(has_ownership_, rhs.has_ownership_);
+  }
+
+ private:
+  // Deletes the stored object via its type's deleter, only if the Blob owns it.
+  void free_() {
+    if (has_ownership_ && pointer_ != nullptr) {
+      (*meta_.deleteFn())(pointer_);
+    }
+  }
+
+  TypeMeta meta_; // type of the stored object; default-constructed when empty
+  void* pointer_{nullptr};
+  bool has_ownership_{false}; // set by Reset(T*); cleared by ShareExternal/Reset()
+
+  C10_DISABLE_COPY_AND_ASSIGN(Blob);
+};
+
+// Non-member swap so that an unqualified swap(a, b) finds Blob via ADL; it
+// forwards to Blob::swap.
+inline void swap(Blob& lhs, Blob& rhs) noexcept { lhs.swap(rhs); }
+
+// Streams the blob as "Blob[<type name>]".
+inline std::ostream& operator<<(std::ostream& out, const Blob& v) {
+  return out << "Blob[" << v.TypeName() << "]";
+}
+
+} // namespace caffe2
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/function.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/function.h
new file mode 100644
index 0000000000000000000000000000000000000000..01e395bcf61064acbe0fa2033cec23e4b9529d38
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/function.h
@@ -0,0 +1,114 @@
+#pragma once
+
+#include <ATen/core/function_schema.h>
+#include <ATen/core/ivalue.h>
+#include <ATen/core/qualified_name.h>
+#include <c10/util/Exception.h>
+#include <c10/util/FunctionRef.h>
+
+namespace c10 {
+struct FunctionSchema;
+} // namespace c10
+
+namespace at {
+TORCH_API void launch(std::function<void()> func); // default TaskLauncher of Function::runAsync
+} // namespace at
+
+namespace torch::jit {
+
+struct Graph;
+struct Code;
+
+namespace mobile {
+struct Code;
+} // namespace mobile
+
+using Stack = std::vector<at::IValue>; // values handed to, and left behind by, Function::run
+using Kwargs = std::unordered_map<std::string, at::IValue>; // keyword arguments by name
+struct RecursiveMethodCallError : public std::exception {};
+using TaskLauncher = std::function<void(std::function<void()>)>; // receives a task to launch
+
+TORCH_API void preoptimizeGraph(
+    std::shared_ptr<Graph>& graph,
+    bool disable_autocast = false);
+
+// A Function is a pure Graph with no implicit `self` object bound.
+// It contains schema information and the executor that manages the
+// execution of the function. Method is a wrapper around an
+// underlying Function that also provides a `self` object.
+struct TORCH_API Function {
+  Function() = default;
+  Function(const Function&) = default;
+  Function& operator=(const Function&) = default;
+  Function(Function&&) noexcept = default;
+  Function& operator=(Function&&) noexcept = default;
+  virtual c10::string_view doc_string() const {
+    static constexpr c10::string_view no_doc_string = "";
+    return no_doc_string; // base implementation: empty doc string
+  }
+
+  virtual bool isGraphFunction() const {
+    return false;
+  }
+
+  virtual void run(Stack& stack) = 0;
+
+  virtual c10::intrusive_ptr<c10::ivalue::Future> runAsync(
+      Stack& /*stack*/,
+      // NOLINTNEXTLINE(performance-unnecessary-value-param)
+      C10_UNUSED TaskLauncher taskLauncher = at::launch) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); // base version is a stub; subclasses override
+    return {};
+  }
+
+  at::IValue operator()(Stack stack, const Kwargs& kwargs = Kwargs()) {
+    getSchema().checkAndNormalizeInputs(stack, kwargs);
+    run(stack);
+    return stack.front(); // the result is the first value left on the stack
+  }
+
+  virtual const c10::QualifiedName& qualname() const = 0;
+
+  const std::string& name() const {
+    return qualname().name();
+  }
+
+  // if this isn't yet defined, run its method_creator function
+  virtual void ensure_defined() = 0;
+
+  virtual const c10::FunctionSchema& getSchema() const = 0;
+
+  virtual size_t num_inputs() const = 0;
+
+  virtual Function& setSchema(c10::FunctionSchema schema) = 0;
+
+  // call() defines how different interpreter implementations interact with
+  // Function objects. Basically interpreters need to provide a callback to
+  // communicate to Functions what to do if provided a Code object.
+  // Alternatively we could design the signature to return an optional Code
+  // object, but that requires special handling of the null case in the
+  // interpreter, and the fallback behavior is not well defined by the
+  // interpreter but rather by the Functions themselves, so a callback approach
+  // is more reasonable than returning values.
+  // If call() returns true, the callback completed successfully; otherwise
+  // call() returns false.
+
+  // Overload for server interpreter, a bailout size is needed for graph
+  // executor.
+  virtual bool call(
+      Stack&,
+      std::optional<size_t>,
+      c10::function_ref<void(const Code&)>) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
+    return false;
+  }
+
+  // Overload for mobile interpreter.
+  virtual bool call(Stack&, c10::function_ref<void(const mobile::Code&)>) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
+    return false;
+  }
+
+  virtual ~Function() = default;
+};
+} // namespace torch::jit
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/functional.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ddc67418201024f6f94907340689e81f0493c35
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/functional.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <vector>
+#include <c10/util/ArrayRef.h>
+
+namespace c10 {
+
+// The passed in function must take T by value (T), or by
+// const reference (const T&); taking T by non-const reference
+// will result in an error like:
+//
+//    error: no type named 'type' in 'class std::invoke_result<foobar::__lambda, T>'
+//
+// No explicit template parameters are required.
+
+// Applies `fn` to every element of `inputs` (e.g. an ArrayRef) and collects a vector.
+template<class F, class T>
+inline auto fmap(const T& inputs, const F& fn) -> std::vector<decltype(fn(*inputs.begin()))> {
+  using Mapped = decltype(fn(*inputs.begin()));
+  std::vector<Mapped> results;
+  results.reserve(inputs.size());
+  for (const auto& element : inputs) { results.push_back(fn(element)); }
+  return results;
+}
+
+// A constructor's address cannot be taken in C++, so it cannot be passed as a
+// function argument; this overload instead converts each element with R(...).
+template<typename R, typename T>
+inline std::vector<R> fmap(const T& inputs) {
+  std::vector<R> converted;
+  converted.reserve(inputs.size());
+  for (auto& element : inputs)
+    converted.push_back(R(element));
+  return converted;
+}
+
+// Returns, in their original order, copies of the elements of `inputs` for
+// which `fn` is truthy. Capacity for the full input size is reserved up front.
+template<typename F, typename T>
+inline std::vector<T> filter(at::ArrayRef<T> inputs, const F& fn) {
+  std::vector<T> kept;
+  kept.reserve(inputs.size());
+  for (auto& candidate : inputs) {
+    if (fn(candidate)) { kept.push_back(candidate); }
+  }
+  return kept;
+}
+
+template<typename F, typename T>
+inline std::vector<T> filter(const std::vector<T>& inputs, const F& fn) {
+  return filter<F, T>(at::ArrayRef<T>(inputs), fn); // reuse the ArrayRef overload
+}
+
+} // namespace c10
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/ivalue_inl.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/ivalue_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d30d3ba5cafef770b1be6652ea792ba2094ecb9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/ivalue_inl.h
@@ -0,0 +1,2539 @@
+#pragma once
+
+#include <condition_variable>
+#include <memory>
+#include <optional>
+#include <type_traits>
+#include <utility>
+
+#include <ATen/core/Dict.h>
+#include <ATen/core/List.h>
+#include <ATen/core/IListRef.h>
+#include <ATen/core/functional.h>
+#include <ATen/core/jit_type.h>
+#include <ATen/core/qualified_name.h>
+#include <ATen/core/rref_interface.h>
+#include <ATen/core/symbol.h>
+#include <c10/core/DeviceGuard.h>
+#include <c10/core/Event.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Stream.h>
+#include <c10/core/StreamGuard.h>
+#include <c10/core/TensorImpl.h>
+#include <c10/core/UndefinedTensorImpl.h>
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <c10/util/FunctionRef.h>
+#include <c10/util/Logging.h>
+#include <c10/util/hash.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/irange.h>
+
+namespace torch {
+namespace jit {
+struct Function; // forward declaration
+struct CompilationUnit; // forward declaration
+} // namespace jit
+TORCH_API bool isCustomClass(const c10::IValue& v);
+} // namespace torch
+namespace c10 {
+struct IValue;
+struct ClassType;
+struct TupleType;
+struct EnumType;
+struct InferredType;
+
+// For custom class __init__ registration, we need to pass in a function
+// that looks like this: [](IValue x, args...)
+
+// However, make_boxed_from_unboxed_functor.h automatically sets the input types
+// of the function by introspecting the types of the functor (which is IValue in
+// this case), whereas we need the type it binds to be the custom class, Foo.
+
+// Instead, we pass in a lambda [](tagged_capsule<CurClass> x, args...) from
+// which getTypePtr can recover the original class pointer.
+
+template <typename TaggedCapsuleType>
+struct tagged_capsule {
+  IValue ivalue; // the wrapped value; TaggedCapsuleType only tags the type
+};
+
+template <class T, class NullType>
+c10::intrusive_ptr<T, NullType> IValue::moveToIntrusivePtr() {
+  auto* const raw = payload.u.as_intrusive_ptr;
+  const bool undefined = raw == c10::UndefinedTensorImpl::singleton();
+  auto owned = c10::intrusive_ptr<T, NullType>::reclaim(
+      undefined ? NullType::singleton() : static_cast<T*>(raw));
+  clearToNone(); // the pointer was handed to `owned` without an incref
+  return owned;
+}
+template <typename T, class NullType>
+c10::intrusive_ptr<T, NullType> IValue::toIntrusivePtr() const {
+  using Ptr = c10::intrusive_ptr<T, NullType>;
+  auto* const raw = payload.u.as_intrusive_ptr;
+  if (raw == c10::UndefinedTensorImpl::singleton()) { return Ptr(); }
+  // Bump the refcount first so the reclaim()'d pointer holds its own reference.
+  c10::raw::intrusive_ptr::incref(raw);
+  return Ptr::reclaim(static_cast<T*>(raw));
+}
+
+template <class T, class U>
+intrusive_ptr<T> static_intrusive_pointer_cast(intrusive_ptr<U> r) {
+  U* const released = r.release(); // take the raw pointer out of `r`
+  return intrusive_ptr<T>::reclaim(static_cast<T*>(released));
+}
+template <class T, class U>
+intrusive_ptr<T> dynamic_intrusive_pointer_cast(intrusive_ptr<U> r) {
+  T* const casted = dynamic_cast<T*>(r.get()); // on failure return null and let `r` free the object (no leak)
+  return casted ? ((void)r.release(), intrusive_ptr<T>::reclaim(casted)) : intrusive_ptr<T>();
+}
+inline c10::intrusive_ptr<ivalue::Future> IValue::toFuture() && {
+  AT_ASSERT(isFuture(), "Expected Future but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::Future>(); // rvalue: hand over our reference
+}
+inline c10::intrusive_ptr<ivalue::Future> IValue::toFuture() const& {
+  AT_ASSERT(isFuture(), "Expected Future but got ", tagKind());
+  return toIntrusivePtr<ivalue::Future>(); // lvalue: share; *this keeps its reference
+}
+inline c10::intrusive_ptr<ivalue::Await> IValue::toAwait() && {
+  AT_ASSERT(isAwait(), "Expected Await but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::Await>(); // rvalue: hand over our reference
+}
+inline c10::intrusive_ptr<ivalue::Await> IValue::toAwait() const& {
+  AT_ASSERT(isAwait(), "Expected Await but got ", tagKind());
+  return toIntrusivePtr<ivalue::Await>(); // lvalue: share; *this keeps its reference
+}
+inline c10::intrusive_ptr<c10::RRefInterface> IValue::toRRef() && {
+  AT_ASSERT(isRRef(), "Expected RRef but got ", tagKind());
+  return moveToIntrusivePtr<c10::RRefInterface>(); // rvalue: hand over our reference
+}
+inline c10::intrusive_ptr<c10::RRefInterface> IValue::toRRef() const& {
+  AT_ASSERT(isRRef(), "Expected RRef but got ", tagKind());
+  return toIntrusivePtr<c10::RRefInterface>(); // lvalue: share; *this keeps its reference
+}
+inline c10::intrusive_ptr<at::Quantizer> IValue::toQuantizer() && {
+  AT_ASSERT(isQuantizer(), "Expected Quantizer but got ", tagKind());
+  return moveToIntrusivePtr<at::Quantizer>(); // rvalue: hand over our reference
+}
+inline c10::intrusive_ptr<at::Quantizer> IValue::toQuantizer() const& {
+  AT_ASSERT(isQuantizer(), "Expected Quantizer but got ", tagKind());
+  return toIntrusivePtr<at::Quantizer>(); // lvalue: share; *this keeps its reference
+}
+inline c10::intrusive_ptr<ivalue::ConstantString> IValue::toString() && {
+  AT_ASSERT(isString(), "Expected String but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::ConstantString>(); // rvalue: hand over our reference
+}
+inline c10::intrusive_ptr<ivalue::ConstantString> IValue::toString() const& {
+  AT_ASSERT(isString(), "Expected String but got ", tagKind());
+  return toIntrusivePtr<ivalue::ConstantString>(); // lvalue: share; *this keeps its reference
+}
+inline c10::intrusive_ptr<ivalue::Object> IValue::toObject() && {
+  AT_ASSERT(isObject(), "Expected Object but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::Object>(); // rvalue: hand over our reference
+}
+inline c10::intrusive_ptr<ivalue::Object> IValue::toObject() const& {
+  AT_ASSERT(isObject(), "Expected Object but got ", tagKind());
+  return toIntrusivePtr<ivalue::Object>(); // lvalue: share; *this keeps its reference
+}
+inline c10::intrusive_ptr<ivalue::PyObjectHolder> IValue::
+    toPyObjectHolder() && {
+  TORCH_INTERNAL_ASSERT(isPyObject(), "Expected PyObject but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::PyObjectHolder>(); // rvalue: hand over our reference
+}
+inline c10::intrusive_ptr<ivalue::PyObjectHolder> IValue::toPyObjectHolder()
+    const& {
+  TORCH_INTERNAL_ASSERT(isPyObject(), "Expected PyObject but got ", tagKind());
+  return toIntrusivePtr<ivalue::PyObjectHolder>(); // lvalue: share; *this keeps its reference
+}
+inline c10::intrusive_ptr<ivalue::EnumHolder> IValue::toEnumHolder() && {
+  TORCH_INTERNAL_ASSERT(isEnum(), "Expected Enum but got ", tagKind());
+  return moveToIntrusivePtr<ivalue::EnumHolder>(); // rvalue: hand over our reference
+}
+inline c10::intrusive_ptr<ivalue::EnumHolder> IValue::toEnumHolder() const& {
+  TORCH_INTERNAL_ASSERT(isEnum(), "Expected Enum but got ", tagKind());
+  return toIntrusivePtr<ivalue::EnumHolder>(); // lvalue: share; *this keeps its reference
+}
+inline c10::complex<double> IValue::toComplexDouble() const {
+  TORCH_INTERNAL_ASSERT(isComplexDouble(), "Expected ComplexDouble but got ", tagKind());
+  const auto holder = toIntrusivePtr<ivalue::ComplexHolder>();
+  return holder->val; // copied out before `holder` drops its reference
+}
+inline at::Tensor IValue::toTensor() && {
+  if (C10_UNLIKELY(!isTensor())) {
+    reportToTensorTypeError(); // presumably does not return (TODO confirm)
+  }
+  auto result = std::move(payload.as_tensor);
+  // As far as I can tell, omitting the usual explicit destructor call
+  // is not UB in and of itself, and it's a slight perf win. The
+  // destructor is a no-op, because the moved-from Tensor is
+  // effectively an intrusive_ptr in the null state, so we don't need
+  // the behavior for correctness reasons either. Leaving this
+  // explanatory comment, including commented-out destructor call, to
+  // make this abundantly clear.
+  //
+  // payload.as_tensor.~Tensor();
+  clearToNone();
+  return result;
+}
+inline at::Tensor& IValue::toTensor() & {
+  if (C10_UNLIKELY(!isTensor())) {
+    reportToTensorTypeError();
+  }
+  return payload.as_tensor; // reference into this IValue's payload, not a copy
+}
+inline const at::Tensor& IValue::toTensor() const& {
+  if (C10_UNLIKELY(!isTensor())) {
+    reportToTensorTypeError();
+  }
+  return payload.as_tensor; // reference into this IValue's payload, not a copy
+}
+inline c10::Storage IValue::toStorage() && {
+  AT_ASSERT(isStorage(), "Expected Storage but got ", tagKind());
+  return c10::Storage(
+      moveToIntrusivePtr<at::StorageImpl>()); // rvalue: hand over our reference
+}
+inline c10::Storage IValue::toStorage() const& {
+  AT_ASSERT(isStorage(), "Expected Storage but got ", tagKind());
+  return c10::Storage(toIntrusivePtr<at::StorageImpl>()); // lvalue: share the StorageImpl
+}
+inline c10::Stream IValue::toStream() && {
+  AT_ASSERT(isStream(), "Expected Stream but got ", tagKind());
+  const auto holder = toIntrusivePtr<ivalue::StreamData3Holder>();
+  const auto& packed = holder->val; // note: this && overload does not clear *this
+  return c10::Stream::unpack3(
+      packed.stream_id, packed.device_index, packed.device_type);
+}
+inline c10::Stream IValue::toStream() const& {
+  AT_ASSERT(isStream(), "Expected Stream but got ", tagKind());
+  const auto holder = toIntrusivePtr<ivalue::StreamData3Holder>();
+  const auto& packed = holder->val; // stream_id / device_index / device_type triple
+  return c10::Stream::unpack3(
+      packed.stream_id, packed.device_index, packed.device_type);
+}
+// Blob accessors: && hands over this IValue's reference, const& shares it.
+inline c10::intrusive_ptr<caffe2::Blob> IValue::toBlob() && {
+  AT_ASSERT(isBlob(), "Expected Blob but got ", tagKind());
+  return moveToIntrusivePtr<caffe2::Blob>();
+}
+inline c10::intrusive_ptr<caffe2::Blob> IValue::toBlob() const& {
+  AT_ASSERT(isBlob(), "Expected Blob but got ", tagKind());
+  return toIntrusivePtr<caffe2::Blob>();
+}
+// Rvalue overload: asserts the Capsule tag, then returns the
+// CustomClassHolder pointer obtained through moveToIntrusivePtr().
+inline c10::intrusive_ptr<torch::CustomClassHolder> IValue::toCapsule() && {
+  TORCH_INTERNAL_ASSERT(isCapsule());
+  auto holder = moveToIntrusivePtr<torch::CustomClassHolder>();
+  return holder;
+}
+// Lvalue overload: asserts the Capsule tag, then returns the
+// CustomClassHolder pointer obtained through toIntrusivePtr().
+inline c10::intrusive_ptr<torch::CustomClassHolder> IValue::toCapsule() const& {
+  TORCH_INTERNAL_ASSERT(isCapsule());
+  auto holder = toIntrusivePtr<torch::CustomClassHolder>();
+  return holder;
+}
+// Rvalue overload: wraps the GeneratorImpl pointer obtained through
+// moveToIntrusivePtr() in an at::Generator.
+inline at::Generator IValue::toGenerator() && {
+  AT_ASSERT(isGenerator(), "Expected Generator but got ", tagKind());
+  auto impl = moveToIntrusivePtr<at::GeneratorImpl>();
+  return at::Generator(std::move(impl));
+}
+// Lvalue overload: wraps the GeneratorImpl pointer obtained through
+// toIntrusivePtr() in an at::Generator.
+inline at::Generator IValue::toGenerator() const& {
+  AT_ASSERT(isGenerator(), "Expected Generator but got ", tagKind());
+  auto impl = toIntrusivePtr<at::GeneratorImpl>();
+  return at::Generator(std::move(impl));
+}
+// Rvalue overload. Accepts either a SymInt or a plain int payload: a plain
+// int is wrapped directly, a SymInt is built from the SymNodeImpl pointer
+// obtained through moveToIntrusivePtr().
+inline c10::SymInt IValue::toSymInt() && {
+  AT_ASSERT(isSymInt() || isInt(), "Expected SymInt or int but got ", tagKind());
+  if (!isSymInt()) {
+    return c10::SymInt(payload.u.as_int);
+  }
+  return c10::SymInt(moveToIntrusivePtr<c10::SymNodeImpl>());
+}
+// Lvalue overload; same as above but goes through toIntrusivePtr().
+inline c10::SymInt IValue::toSymInt() const& {
+  AT_ASSERT(isSymInt() || isInt(), "Expected SymInt or int but got ", tagKind());
+  if (!isSymInt()) {
+    return c10::SymInt(payload.u.as_int);
+  }
+  return c10::SymInt(toIntrusivePtr<c10::SymNodeImpl>());
+}
+// Rvalue overload. Accepts either a SymFloat or a plain double payload: a
+// plain double is wrapped directly, a SymFloat is built from the SymNodeImpl
+// pointer obtained through moveToIntrusivePtr().
+inline c10::SymFloat IValue::toSymFloat() && {
+  AT_ASSERT(isSymFloat() || isDouble(), "Expected SymFloat or double but got ", tagKind());
+  if (!isSymFloat()) {
+    return c10::SymFloat(payload.u.as_double);
+  }
+  return c10::SymFloat(moveToIntrusivePtr<c10::SymNodeImpl>());
+}
+// Lvalue overload; same as above but goes through toIntrusivePtr().
+inline c10::SymFloat IValue::toSymFloat() const& {
+  AT_ASSERT(isSymFloat() || isDouble(), "Expected SymFloat or double but got ", tagKind());
+  if (!isSymFloat()) {
+    return c10::SymFloat(payload.u.as_double);
+  }
+  return c10::SymFloat(toIntrusivePtr<c10::SymNodeImpl>());
+}
+// Rvalue overload. Accepts either a SymBool or a plain bool payload: a plain
+// bool is wrapped directly, a SymBool is built from the SymNodeImpl pointer
+// obtained through moveToIntrusivePtr().
+inline c10::SymBool IValue::toSymBool() && {
+  AT_ASSERT(isSymBool() || isBool(), "Expected SymBool or boolean but got ", tagKind());
+  if (!isSymBool()) {
+    return c10::SymBool(payload.u.as_bool);
+  }
+  return c10::SymBool(moveToIntrusivePtr<c10::SymNodeImpl>());
+}
+
+// Lvalue overload; same as above but goes through toIntrusivePtr().
+inline c10::SymBool IValue::toSymBool() const& {
+  AT_ASSERT(isSymBool() || isBool(), "Expected SymBool or boolean but got ", tagKind());
+  if (!isSymBool()) {
+    return c10::SymBool(payload.u.as_bool);
+  }
+  return c10::SymBool(toIntrusivePtr<c10::SymNodeImpl>());
+}
+
+namespace ivalue {
+
+// Declaration only; the definition is not part of this header section.
+// NOTE(review): presumably checks that actual_type is compatible with
+// expected_type for custom classes -- confirm against the definition.
+void TORCH_API
+checkCustomClassType(const ClassType* expected_type, const Type* actual_type);
+
+// Shorthand alias: ivalue::Shared<T> is c10::intrusive_ptr<T>.
+template <typename T>
+using Shared = c10::intrusive_ptr<T>;
+
+// Immutable, intrusively ref-counted string held by an IValue.
+struct TORCH_API ConstantString final : c10::intrusive_ptr_target {
+ private:
+  // The text itself; const, so it cannot change after construction.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const std::string str_;
+
+ public:
+  // Construct from an owned string (moved in) or from a string_view (copied).
+  ConstantString(std::string text) : str_(std::move(text)) {}
+  ConstantString(c10::string_view text) : str_(std::string(text)) {}
+
+  // Factory functions; declared here, defined elsewhere.
+  static c10::intrusive_ptr<ConstantString> create(std::string text);
+  static c10::intrusive_ptr<ConstantString> create(c10::string_view text);
+  static c10::intrusive_ptr<ConstantString> create(const char* text);
+
+  // Reference to the stored std::string.
+  const std::string& string() const {
+    return str_;
+  }
+  // Non-owning view over the stored std::string.
+  c10::string_view string_view() const {
+    return c10::string_view(str_);
+  }
+
+  // Implicit conversion to the stored std::string.
+  operator const std::string&() const {
+    return str_;
+  }
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const ConstantString& v);
+};
+
+struct Future;
+
+// Element storage for ivalue::Tuple. Either up to three IValues are kept
+// inline (inlineSize_ is 1..3) or a std::vector<IValue> is used
+// (inlineSize_ == 0). inlineSize_ is the discriminator for the anonymous union
+// below, so every member that switches representation must destroy the old
+// union member, placement-new the other one, and keep inlineSize_ in step.
+struct TORCH_API TupleElements {
+ private:
+  // Number of live inline elements; 0 means elementsVector_ is the live member.
+  size_t inlineSize_;
+  // We represent TupleElements this way to save doing a heap
+  // allocation in the common (at least for unpickling) case where we
+  // have only 3 elements. We have our own union instead of
+  // c10::SmallVector<IValue> because c10::SmallVector<IValue> always
+  // stores the begin/end/capacity pointers, which would be a waste of
+  // space in our use case.
+  union {
+    std::vector<IValue> elementsVector_;
+    // Don't want to declare a std::array because the convenient
+    // iteration and size members are a footgun in this case -- the
+    // actual size of the array may be smaller than 3!
+    // NOLINTNEXTLINE(*c-arrays*)
+    IValue elementsInline_[3];
+  };
+
+  // Runs the destructor of each of the first inlineSize_ inline elements.
+  // Does not touch inlineSize_ itself.
+  void destroyInline() {
+   for (const auto ii : c10::irange(inlineSize_)) {
+     elementsInline_[ii].~IValue();
+   }
+  }
+ public:
+
+  using iterator = IValue*;
+  using const_iterator = const IValue*;
+
+  // Empty container: vector representation with an empty vector.
+  TupleElements() : inlineSize_(0) {
+    new (&elementsVector_) std::vector<IValue>();
+  }
+
+  // Takes ownership of `elements`; always uses the vector representation,
+  // regardless of how many elements there are.
+  explicit TupleElements(std::vector<IValue> elements)
+  : inlineSize_(0), elementsVector_(std::move(elements)) {}
+
+  // Copies from an ArrayRef. One to three elements go inline; zero or more
+  // than three elements go into the vector.
+  explicit TupleElements(c10::ArrayRef<IValue> elements)
+  : inlineSize_(elements.size() <= 3 ? elements.size() : 0) {
+    // The cases fall through on purpose so that a size of N constructs
+    // slots N-1 down to 0.
+    switch (inlineSize_) {
+      case 3:
+        new (&elementsInline_[2]) IValue(elements[2]);
+        [[fallthrough]];
+      case 2:
+        new (&elementsInline_[1]) IValue(elements[1]);
+        [[fallthrough]];
+      case 1:
+        new (&elementsInline_[0]) IValue(elements[0]);
+        break;
+      case 0:
+        new (&elementsVector_) std::vector<IValue>(elements.begin(), elements.end());
+        break;
+    }
+  }
+
+  // One, two and three element constructors: move the arguments straight
+  // into the inline slots.
+  explicit TupleElements(IValue&& e1)
+  : inlineSize_(1) {
+    new (&elementsInline_[0]) IValue(std::move(e1));
+  }
+
+  explicit TupleElements(IValue&& e1, IValue&& e2)
+  : inlineSize_(2) {
+    new (&elementsInline_[0]) IValue(std::move(e1));
+    new (&elementsInline_[1]) IValue(std::move(e2));
+  }
+
+  explicit TupleElements(IValue&& e1, IValue&& e2, IValue&& e3)
+  : inlineSize_(3) {
+    new (&elementsInline_[0]) IValue(std::move(e1));
+    new (&elementsInline_[1]) IValue(std::move(e2));
+    new (&elementsInline_[2]) IValue(std::move(e3));
+  }
+
+  // Destroys whichever union member is live.
+  ~TupleElements() {
+    if (inlineSize_) {
+      destroyInline();
+    } else {
+      elementsVector_.~vector();
+    }
+  }
+
+  // It would be nice to make this noncopyable to prevent people from
+  // writing code like `auto output =
+  // forward(...).toTupleRef().elements()` (which does refcount bumps on
+  // each element, unlike the more efficient but verbose
+  // ```
+  // auto outputIntrusivePtr = forward(...).toTuple();
+  // const auto& output = outputIntrusivePtr->elements();
+  // ```
+  // ), but there is simply an overwhelming amount of code that does
+  // it the inefficient way.
+  // See also operator std::vector below.
+  TupleElements(const TupleElements& rhs)
+  : inlineSize_(rhs.inlineSize_) {
+    if (rhs.inlineSize_) {
+      for (const auto  ii : c10::irange(inlineSize_)) {
+        new (&elementsInline_[ii]) IValue(rhs.elementsInline_[ii]);
+      }
+    } else {
+      new (&elementsVector_) std::vector<IValue>(rhs.elementsVector_);
+    }
+  }
+
+  // Copy assignment. Handles all four combinations of {inline, vector} for
+  // *this and rhs. When both are inline, the overlapping prefix is assigned,
+  // then either the extra rhs elements are constructed or the surplus local
+  // elements are destroyed. inlineSize_ is only updated at the very end.
+  TupleElements& operator=(const TupleElements& rhs) {
+    if (inlineSize_) {
+      if (rhs.inlineSize_) {
+        for (const auto ii : c10::irange(std::min(inlineSize_, rhs.inlineSize_))) {
+          elementsInline_[ii] = rhs.elementsInline_[ii];
+        }
+        if (rhs.inlineSize_ > inlineSize_) {
+          for (const auto ii : c10::irange(inlineSize_, rhs.inlineSize_)) {
+            new (&elementsInline_[ii]) IValue(rhs.elementsInline_[ii]);
+          }
+        } else {
+          for (const auto ii : c10::irange(rhs.inlineSize_, inlineSize_)) {
+            elementsInline_[ii].~IValue();
+          }
+        }
+      } else {
+        // inline -> vector: end the inline elements' lifetime first.
+        destroyInline();
+        new (&elementsVector_) std::vector<IValue>(rhs.elementsVector_);
+      }
+    } else {
+      if (rhs.inlineSize_) {
+        // vector -> inline: end the vector's lifetime first.
+        elementsVector_.~vector();
+        for (const auto ii : c10::irange(rhs.inlineSize_)) {
+          new (&elementsInline_[ii]) IValue(rhs.elementsInline_[ii]);
+        }
+      } else {
+        elementsVector_ = rhs.elementsVector_;
+      }
+    }
+    inlineSize_ = rhs.inlineSize_;
+    return *this;
+  }
+
+  // Move construction: moves each inline element or the whole vector. rhs
+  // keeps its inlineSize_, so its (moved-from) members are still destroyed by
+  // its own destructor.
+  TupleElements(TupleElements&& rhs) noexcept
+  : inlineSize_(rhs.inlineSize_) {
+    if (inlineSize_) {
+      for (const auto ii : c10::irange(inlineSize_)) {
+        new (&elementsInline_[ii]) IValue(std::move(rhs.elementsInline_[ii]));
+      }
+    } else {
+      new (&elementsVector_) std::vector<IValue>(std::move(rhs.elementsVector_));
+    }
+  }
+
+  // Move assignment; same four-way case split as copy assignment, moving
+  // instead of copying. rhs.inlineSize_ is left unchanged.
+  TupleElements& operator=(TupleElements&& rhs) noexcept {
+    if (inlineSize_) {
+      if (rhs.inlineSize_) {
+        for (const auto ii : c10::irange(std::min(inlineSize_, rhs.inlineSize_))) {
+          elementsInline_[ii] = std::move(rhs.elementsInline_[ii]);
+        }
+        if (rhs.inlineSize_ > inlineSize_) {
+          for (const auto ii : c10::irange(inlineSize_, rhs.inlineSize_)) {
+            new (&elementsInline_[ii]) IValue(std::move(rhs.elementsInline_[ii]));
+          }
+        } else {
+          for (const auto ii : c10::irange(rhs.inlineSize_, inlineSize_)) {
+            elementsInline_[ii].~IValue();
+          }
+        }
+      } else {
+        destroyInline();
+        new (&elementsVector_) std::vector<IValue>(std::move(rhs.elementsVector_));
+      }
+    } else {
+      if (rhs.inlineSize_) {
+        elementsVector_.~vector();
+        for (const auto ii : c10::irange(rhs.inlineSize_)) {
+          new (&elementsInline_[ii]) IValue(std::move(rhs.elementsInline_[ii]));
+        }
+      } else {
+        elementsVector_ = std::move(rhs.elementsVector_);
+      }
+    }
+    inlineSize_ = rhs.inlineSize_;
+    return *this;
+  }
+
+  // Non-owning view over the live elements, whichever representation is used.
+  C10_NODISCARD c10::ArrayRef<IValue> asArrayRef() const {
+    if (inlineSize_) {
+      return c10::ArrayRef<IValue>(elementsInline_, inlineSize_);
+    } else {
+      return elementsVector_;
+    }
+  }
+
+  // Mimic implicit conversion from std::vector to ArrayRef.
+  operator c10::ArrayRef<IValue>() const {
+    return asArrayRef();
+  }
+
+  // Hash of the elements, computed over the ArrayRef view.
+  static size_t hash(const TupleElements& v) {
+    return c10::hash<c10::ArrayRef<IValue>>()(v.asArrayRef());
+  }
+
+  // Replaces the contents with `contents`. The result always uses the vector
+  // representation, even if `contents` has three or fewer elements.
+  void setContents(std::vector<IValue>&& contents) {
+    if (inlineSize_) {
+      destroyInline();
+      new (&elementsVector_) std::vector<IValue>(std::move(contents));
+      inlineSize_ = 0;
+    } else {
+      elementsVector_ = std::move(contents);
+    }
+  }
+
+  // A non-zero inlineSize_ means at least one inline element, hence not empty.
+  C10_NODISCARD bool empty() const {
+    return inlineSize_ ? false : elementsVector_.empty();
+  }
+
+  C10_NODISCARD size_t size() const {
+    return inlineSize_ ? inlineSize_ : elementsVector_.size();
+  }
+
+  // Unchecked element access (no bounds check in either representation).
+  C10_NODISCARD IValue& operator[](size_t idx) {
+    if (inlineSize_) {
+      return elementsInline_[idx];
+    } else {
+      return elementsVector_[idx];
+    }
+  }
+
+  C10_NODISCARD const IValue& operator[](size_t idx) const {
+    if (inlineSize_) {
+      return elementsInline_[idx];
+    } else {
+      return elementsVector_[idx];
+    }
+  }
+
+  // Bounds-checked element access.
+  // NOTE(review): unlike the const overload below, the vector branch here has
+  // no TORCH_CHECK and relies solely on std::vector::at for the range check.
+  C10_NODISCARD IValue& at(size_t idx) {
+    if (inlineSize_) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inlineSize_ <= 3);
+      TORCH_CHECK(idx < inlineSize_, "TupleElements: invalid index Index = ", idx, "; Length = ", inlineSize_);
+      return elementsInline_[idx];
+    } else {
+      return elementsVector_.at(idx);
+    }
+  }
+
+  // Bounds-checked element access; both branches go through TORCH_CHECK.
+  C10_NODISCARD const IValue& at(size_t idx) const {
+    if (inlineSize_) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(inlineSize_ <= 3);
+      TORCH_CHECK(idx < inlineSize_, "TupleElements: invalid index Index = ", idx, "; Length = ", inlineSize_);
+      return elementsInline_[idx];
+    } else {
+      TORCH_CHECK(idx < elementsVector_.size(), "TupleElements: invalid index Index = ", idx, "; Length = ", elementsVector_.size());
+      return elementsVector_.at(idx);
+    }
+  }
+
+  // Iterators are raw IValue pointers into whichever representation is live.
+  C10_NODISCARD iterator begin() {
+    if (inlineSize_) {
+      return elementsInline_;
+    } else {
+      return elementsVector_.data();
+    }
+  }
+
+  C10_NODISCARD iterator end() {
+    if (inlineSize_) {
+      return elementsInline_ + inlineSize_;
+    } else {
+      return elementsVector_.data() + elementsVector_.size();
+    }
+  }
+
+  C10_NODISCARD const_iterator begin() const {
+    if (inlineSize_) {
+      return elementsInline_;
+    } else {
+      return elementsVector_.data();
+    }
+  }
+
+  C10_NODISCARD const_iterator end() const {
+    if (inlineSize_) {
+      return elementsInline_ + inlineSize_;
+    } else {
+      return elementsVector_.data() + elementsVector_.size();
+    }
+  }
+
+  C10_NODISCARD const_iterator cbegin() const {
+    return begin();
+  }
+
+  C10_NODISCARD const_iterator cend() const {
+    return end();
+  }
+
+  // Copies the elements into a fresh std::vector.
+  C10_NODISCARD std::vector<IValue> vec() const & {
+    return asArrayRef().vec();
+  }
+
+  // Last element. Dereferences end() - 1 with no emptiness check, so the
+  // container must not be empty.
+  C10_NODISCARD IValue& back() {
+    return *(end() - 1);
+  }
+
+  C10_NODISCARD const IValue& back() const {
+    return *(end() - 1);
+  }
+
+  // Rvalue overload: moves each element into a fresh std::vector.
+  C10_NODISCARD std::vector<IValue> vec() && {
+    std::vector<IValue> result;
+    result.reserve(size());
+    for (auto&& iv : *this) {
+      result.push_back(std::move(iv));
+    }
+    return result;
+  }
+
+  // More compatibility shims for the overwhelming amount of code that
+  // likes to copy tuple elements into a vector; see comment above the
+  // copy constructor.
+  operator std::vector<IValue>() const & {
+    return vec();
+  }
+
+  // NOTE(review): inside this member function *this is an lvalue, so this
+  // call resolves to the `const &` vec() overload (a copy) -- confirm whether
+  // a move was intended.
+  operator std::vector<IValue>() && {
+    return vec();
+  }
+};
+
+// Helper used through TupleTypeFactory<T>::create / ::fallback to obtain a
+// tuple type object of kind T. The primary template is empty; only the two
+// specializations below provide those functions.
+template <typename T>
+struct TupleTypeFactory {};
+
+// Specialization for TupleType.
+template <>
+struct TORCH_API TupleTypeFactory<TupleType> {
+  // Forwards the element types to TupleType::create.
+  static TupleTypePtr create(std::vector<TypePtr> types) {
+    return TupleType::create(std::move(types));
+  }
+  // Declared here, defined elsewhere.
+  static TupleTypePtr fallback(const Type& type);
+};
+
+// Specialization for c10::DynamicType; both functions are declared here and
+// defined elsewhere.
+template <>
+struct TORCH_API TupleTypeFactory<c10::DynamicType> {
+  static DynamicTypePtr create(const std::vector<TypePtr>& elemTypes);
+  static DynamicTypePtr fallback(const Type&);
+};
+
+// Intrusively ref-counted tuple of IValues. Instances are created through the
+// static create()/createNamed() factories; all constructors are private and
+// reachable only through the c10::intrusive_ptr<Tuple> friend.
+struct TORCH_API Tuple : c10::intrusive_ptr_target {
+ private:
+  TupleElements elements_;
+  mutable c10::TypePtr type_; // lazily computed for unnamed tuples
+
+ public:
+  // named tuples have additional type information, so we
+  // directly create them tagged
+  static c10::intrusive_ptr<Tuple> createNamed(
+      std::vector<IValue> elements_,
+      c10::TypePtr type_) {
+    return c10::make_intrusive<Tuple>(std::move(elements_), std::move(type_));
+  }
+
+  static c10::intrusive_ptr<Tuple> createNamed(
+      TupleElements elements_,
+      std::shared_ptr<TupleType> type_) {
+    return c10::make_intrusive<Tuple>(std::move(elements_), std::move(type_));
+  }
+
+  // Copies the initializer_list into a TupleElements, then defers to the
+  // TupleElements overload above.
+  static c10::intrusive_ptr<Tuple> createNamed(
+      std::initializer_list<IValue> elements_,
+      std::shared_ptr<TupleType> type_) {
+    return createNamed(TupleElements(c10::ArrayRef<IValue>(elements_)), std::move(type_));
+  }
+
+  // MSVC apparently can't disambiguate the other two overloads of
+  // create when passed an initializer_list without this.
+  static c10::intrusive_ptr<Tuple> create(std::initializer_list<IValue> elements_) {
+    return create(c10::ArrayRef<IValue>(elements_));
+  }
+
+  // Unnamed tuples: type_ stays empty until type() is first called.
+  static c10::intrusive_ptr<Tuple> create(std::vector<IValue> elements_) {
+    return c10::make_intrusive<Tuple>(std::move(elements_));
+  }
+
+  static c10::intrusive_ptr<Tuple> create(TupleElements elements_) {
+    return c10::make_intrusive<Tuple>(std::move(elements_));
+  }
+
+  static c10::intrusive_ptr<Tuple> create(c10::ArrayRef<IValue> elements_) {
+    return create(TupleElements(elements_));
+  }
+
+  // One, two and three element overloads; these reach the TupleElements
+  // constructors that fill the inline slots.
+  static c10::intrusive_ptr<Tuple> create(IValue e1) {
+    return c10::make_intrusive<Tuple>(std::move(e1));
+  }
+
+  static c10::intrusive_ptr<Tuple> create(IValue e1, IValue e2) {
+    return c10::make_intrusive<Tuple>(std::move(e1), std::move(e2));
+  }
+
+  static c10::intrusive_ptr<Tuple> create(IValue e1, IValue e2, IValue e3) {
+    return c10::make_intrusive<Tuple>(std::move(e1), std::move(e2), std::move(e3));
+  }
+
+ private:
+  // Workaround inability to use `>` operator in template argument list.
+  // NOTE(review): not referenced anywhere in the visible part of this class.
+  template <typename... Args>
+  static constexpr bool hasMoreThanThreeArgs() {
+    return sizeof...(Args) > 3;
+  }
+
+ public:
+  // Generic overload: converts every argument to an IValue. With one to three
+  // arguments it dispatches to the fixed-arity overloads above; otherwise the
+  // IValues are collected into a std::vector. This is a runtime switch, so
+  // both branches are instantiated for every arity.
+  template <typename... Args>
+  static c10::intrusive_ptr<Tuple> create(Args&&... elements_) {
+    switch (sizeof...(Args)) {
+      case 1:
+      case 2:
+      case 3:
+        return create(IValue(std::forward<Args>(elements_))...);
+      default:
+        return create(
+            std::vector<IValue>{IValue(std::forward<Args>(elements_))...});
+    }
+  }
+
+  // Again, it would be nice to make this noncopyable, but there's a
+  // lot of extant code that copies Tuples.
+  // Tuple(const Tuple& rhs) = delete;
+
+  // Borrow the elements from an lvalue Tuple.
+  const TupleElements& elements() const& {
+    return elements_;
+  }
+
+  // Move the elements out of an rvalue Tuple.
+  TupleElements elements() && {
+    return std::move(elements_);
+  }
+
+  // Replace all elements. Neither overload touches type_.
+  void setElements(std::vector<IValue>&& elements) {
+    elements_.setContents(std::move(elements));
+  }
+
+  void setElements(TupleElements&& elements) {
+    elements_ = std::move(elements);
+  }
+
+  // Overwrite one element through the unchecked operator[]; no bounds check
+  // and no update of type_.
+  void unsafeSetElement(size_t idx, const IValue& element) {
+    elements_[idx] = element;
+  }
+
+  void unsafeSetElement(size_t idx, IValue&& element) {
+    elements_[idx] = std::move(element);
+  }
+
+  size_t size() const {
+    return elements_.size();
+  }
+
+  // Returns the tuple's type as a std::shared_ptr<T>. If type_ is unset it is
+  // first built from the element types and cached. If the cached type is not
+  // a T, TupleTypeFactory<T>::fallback is used to produce the result.
+  // NOTE(review): type_ is written from this const method and no
+  // synchronization is visible here.
+  template <typename T = c10::TupleType>
+  std::shared_ptr<T> type() const {
+    if (!type_) {
+      type_ = TupleTypeFactory<T>::create(fmap(elements(), [&](const IValue& v) {
+        return v.type<typename T::ElementType>();
+      }));
+    }
+    if (auto t = type_->cast<T>()) {
+      return t;
+    }
+    return TupleTypeFactory<T>::fallback(*type_);
+  }
+
+  // Hash of the tuple, computed from its elements.
+  static size_t hash(const Tuple& t) {
+    return c10::get_hash(t.elements());
+  }
+
+  TORCH_API friend bool operator==(
+      const ivalue::Tuple& lhs,
+      const ivalue::Tuple& rhs);
+
+ private:
+  // NOTE: If we try to avoid the overloads without
+  // `std::shared_ptr<TupleType> type` by defaulting it to nullptr, we
+  // end up having to call (part of) the shared_ptr destructor for
+  // `type` even though we should know statically it won't do
+  // anything.
+  explicit Tuple(std::vector<IValue> elements)
+    : elements_(std::move(elements)){}
+
+  explicit Tuple(std::vector<IValue> elements, c10::TypePtr type)
+    : elements_(std::move(elements)), type_(std::move(type)) {}
+
+  explicit Tuple(TupleElements&& elements)
+    : elements_(std::move(elements)) {}
+
+  explicit Tuple(TupleElements&& elements, std::shared_ptr<TupleType> type)
+    : elements_(std::move(elements)), type_(std::move(type)) {}
+
+  explicit Tuple(IValue&& e1)
+    : elements_(std::move(e1)) {}
+
+  explicit Tuple(IValue&& e1, std::shared_ptr<TupleType> type)
+    : elements_(std::move(e1)), type_(std::move(type)) {}
+
+  explicit Tuple(IValue&& e1, IValue&& e2)
+    : elements_(std::move(e1), std::move(e2)) {}
+
+  explicit Tuple(IValue&& e1, IValue&& e2, std::shared_ptr<TupleType> type)
+    : elements_(std::move(e1), std::move(e2)), type_(std::move(type)) {}
+
+  explicit Tuple(IValue&& e1, IValue&& e2, IValue&& e3)
+    : elements_(std::move(e1), std::move(e2), std::move(e3)) {}
+
+  explicit Tuple(IValue&& e1, IValue&& e2, IValue&& e3, std::shared_ptr<TupleType> type)
+    : elements_(std::move(e1), std::move(e2), std::move(e3)), type_(std::move(type)) {}
+
+  // Lets c10::intrusive_ptr<Tuple> reach the private constructors.
+  friend class c10::intrusive_ptr<Tuple>;
+};
+
+struct Object;
+struct PyObjectHolder;
+struct EnumHolder;
+} // namespace ivalue
+
+// Future
+struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
+ private:
+  // Keep this private in order to force users to go through make_intrusive and
+  // thus prevent creating a Future that's not held by an intrusive_ptr.
+  //
+  // NOTE(review): impl_ is initialized from `devices` before `devices` is
+  // moved into sortAndDeduplicateDevices(), and the devices_ initializer reads
+  // impl_. Both rely on impl_ being declared before devices_ in the class; the
+  // member declarations are not visible here -- confirm.
+  explicit Future(TypePtr type, std::vector<c10::Device> devices={})
+      : type_(std::move(type)),
+        impl_(getTypeOfDevices(devices)),
+        devices_(sortAndDeduplicateDevices(impl_, std::move(devices))) {}
+
+  friend c10::intrusive_ptr<Future>;
+
+  // A registered continuation plus the flag recording whether it actually
+  // reads the Future& it is handed.
+  struct FutureCallback {
+    std::function<void(Future&)> callback;
+    bool uses_future; // whether the Future& passed in is actually used
+
+    // Accepts any callable that std::function<void(Future&)> can hold.
+    template <typename T>
+    FutureCallback(T cb, bool usesFuture)
+        : callback(std::move(cb)), uses_future(usesFuture) {}
+  };
+
+ public:
+  // A Future can be neither copied nor moved.
+  Future(const Future&) = delete;
+  Future(Future&&) = delete;
+  Future& operator=(const Future&) = delete;
+  Future& operator=(Future&&) = delete;
+
+  // Exception type that carries a plain message; what() exposes error_msg.
+  struct TORCH_API FutureError final : public std::exception {
+    // Default construction leaves the message empty.
+    FutureError() = default;
+
+    // Takes ownership of the supplied message.
+    explicit FutureError(std::string&& message)
+        : error_msg(std::move(message)) {}
+
+    const char* what() const noexcept override {
+      return error_msg.c_str();
+    }
+
+    std::string error_msg;
+  };
+
+  /**
+   * Block the calling thread until the future has completed, then
+   * synchronize with the current streams.
+   */
+  void wait() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    // Explicit form of the predicate overload of condition_variable::wait.
+    while (!completed_) {
+      finished_cv_.wait(lock);
+    }
+    synchronizeWithCurrentStreams();
+  }
+
+  /**
+   * Same as wait(), but afterwards rethrows the stored exception if the
+   * future completed with an error.
+   */
+  void waitAndThrow() {
+    wait();
+
+    if (!eptr_) {
+      return;
+    }
+    std::rethrow_exception(eptr_);
+  }
+
+  /**
+   * Explicitly mark the future as completed with the output value. Optionally,
+   * the storages for all tensors in IValue can be passed as well. The DataPtrs
+   * of these storages are used to synchronize CUDA streams. If storages isn't
+   * given we will attempt to extract it from the value, if we need to (this
+   * happens if a non-empty set of devices was given to the constructor). Thus
+   * one only needs to provide storages when 1) they cannot be extracted through
+   * IValue::getSubValues() or through pickling in case of Python object; or
+   * when 2) customized storage extraction is more efficient.
+   */
+  using WeakStorage = c10::weak_intrusive_ptr<c10::StorageImpl>;
+  // Completes the future with `value`. If the storage/device preparation
+  // throws a std::exception, the future is completed with that error instead
+  // (via setError) and the function returns early. Marking an already
+  // completed future fails the TORCH_CHECK below.
+  void markCompleted(
+      IValue value,
+      std::optional<std::vector<WeakStorage>> storages = std::nullopt) {
+    // Start by performing all steps that can throw, before setting any field.
+    // Do this before even acquiring the mutex, because extractStorages might
+    // acquire the GIL, which could lead to a lock inversion with our mutex.
+    // See https://github.com/pytorch/pytorch/issues/58239.
+    std::vector<WeakStorage> actualStorages;
+    std::vector<c10::Device> usedDevices;
+    try {
+      // FIXME We should always extract DataPtrs, in order to catch the case of
+      // users using CUDA values but forgetting to set devices, which currently
+      // leads to a silent synchronization/correctness issue. However, as this
+      // might worsen perf in CPU-only cases, we should only do so after careful
+      // benchmarks.
+      if (impl_.type() != c10::kCPU) {
+        // Caller-provided storages take precedence over extraction.
+        actualStorages =
+            storages.has_value() ? std::move(*storages) : extractStorages(value);
+        usedDevices = getDevicesOfStorages(impl_, actualStorages);
+        ensureIsSubsetOfDevices(usedDevices, devices_);
+      }
+    } catch (const std::exception&) {
+      setError(std::current_exception());
+      return;
+    }
+
+    std::unique_lock<std::mutex> lock(mutex_);
+    TORCH_CHECK(
+        !completed(),
+        "Attempting to mark a completed Future as complete again. Note that "
+        "a Future can only be marked completed once.");
+
+    // Only set value_ and completed_ flag once all checks and preparation steps
+    // have returned successfully to allow for proper error propagation.
+    value_ = std::move(value);
+    completed_ = true;
+
+    currentDevice_ = impl_.getDevice();
+    storages_ = std::move(actualStorages);
+    // Record one event per used device on that device's stream.
+    for (const c10::Device& device : usedDevices) {
+      c10::Event event(impl_.type());
+      event.record(impl_.getStream(device));
+      events_.push_back(std::move(event));
+    }
+
+    // Take the pending callbacks while still holding the lock, then release
+    // it so that waiters are notified and callbacks run without mutex_ held.
+    std::vector<FutureCallback> cbs;
+    cbs.swap(callbacks_);
+    lock.unlock();
+
+    finished_cv_.notify_all();
+    for (auto& callback : cbs) {
+      invokeCallback(std::move(callback.callback), callback.uses_future);
+    }
+  }
+
+  // Convenience overload: completes the future with a default-constructed
+  // IValue.
+  void markCompleted() {
+    IValue noValue;
+    markCompleted(std::move(noValue));
+  }
+
+  void setError(std::exception_ptr eptr) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    setErrorInternal(std::move(eptr), lock);
+  }
+
+  // Like setError(), except that an already completed future is left
+  // untouched: the rejected error is only logged.
+  void setErrorIfNeeded(std::exception_ptr eptr) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (!completed_) {
+      setErrorInternal(std::move(eptr), lock);
+      return;
+    }
+
+    // This should be rare and shouldn't cause log spew. It's important to
+    // log errors, and that's why we have this log here.
+    std::string msg = c10::str(
+        "Skipping setting following error on the Future since "
+        "it is already marked completed (this is not necessarily "
+        "an error):\n",
+        tryRetrieveErrorMessageInternal(std::move(eptr)));
+    if (eptr_) {
+      // The future itself finished with an error; include that one as well.
+      msg += c10::str(
+          ", \nOriginal exception:\n",
+          tryRetrieveErrorMessageInternal(eptr_));
+    }
+    LOG(INFO) << msg;
+  }
+
+  // Returns a copy of the stored result. Asserts that the future has
+  // completed and rethrows the stored exception if it finished with an error.
+  IValue value() {
+    std::unique_lock<std::mutex> guard(mutex_);
+    AT_ASSERT(completed());
+    if (!eptr_) {
+      return value_;
+    }
+    std::rethrow_exception(eptr_);
+  }
+
+  // Returns a reference to the stored result without copying it. Only valid
+  // on a future that is completed() and holds no error; both conditions are
+  // asserted. The mutex is released when this function returns.
+  const IValue& constValue() const {
+    std::unique_lock<std::mutex> guard(mutex_);
+    AT_ASSERT(completed());
+    TORCH_INTERNAL_ASSERT(
+      !eptr_,
+      "value() accessor should only be used when future is not completed with ",
+      "an error, but future had the following error: ",
+      tryRetrieveErrorMessageInternal(eptr_)
+    );
+    return value_;
+  }
+
+  // Returns a reference to the recorded storages. Only valid on a future that
+  // is completed() and holds no error; both conditions are asserted.
+  const std::vector<WeakStorage>& storages() const {
+    std::unique_lock<std::mutex> guard(mutex_);
+    AT_ASSERT(completed());
+    AT_ASSERT(!eptr_);
+    return storages_;
+  }
+
+  /**
+   * Register a callback on the future.
+   * A callback registered before completion is stored and run once the
+   * future completes. If the future has already completed, the callback is
+   * run immediately, after the mutex has been released.
+   */
+  template <typename T>
+  void addCallback(T callback, bool uses_future = true) {
+    static_assert(
+        std::is_invocable_r<void, T, Future&>::value,
+        "The callback must have signature void(Future&)");
+
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (!completed()) {
+      // Not done yet: queue the callback while still holding the lock.
+      callbacks_.emplace_back(std::move(callback), uses_future);
+      return;
+    }
+    lock.unlock();
+    invokeCallback(std::move(callback), uses_future);
+  }
+
+  /**
+   * Register a callback and return a child Future that receives the
+   * callback's result. The callback may return either an IValue or a tuple
+   * of an IValue and its storages. A std::exception thrown by the callback
+   * (or while completing the child) is stored on the child as its error.
+   */
+  template <typename T>
+  c10::intrusive_ptr<Future> then(T callback, TypePtr type) {
+    using IValueWithStorages = std::tuple<IValue, std::vector<WeakStorage>>;
+    static_assert(
+        std::disjunction<
+            std::is_invocable_r<IValue, T, Future&>,
+            std::is_invocable_r<IValueWithStorages, T, Future&>>::value,
+        "The callback must have signature IValue(Future&) or "
+        "std::tuple<IValue, std::vector<Storage>>(Future&)");
+
+    auto child = createInstance(::std::move(type));
+    addCallback([child,
+                 userCb = std::move(callback)](Future& parent) mutable {
+      try {
+        if constexpr (::std::is_convertible_v<typename std::invoke_result_t<T &&, Future&>, IValueWithStorages>) {
+          // The callback also supplies the storages; pass both along.
+          auto [result, resultStorages] = userCb(parent);
+          child->markCompleted(::std::move(result), ::std::move(resultStorages));
+        } else {
+          child->markCompleted(userCb(parent));
+        }
+      } catch (const std::exception&) {
+        child->setError(std::current_exception());
+      }
+    });
+    return child;
+  }
+
+  // Variant of then() for callbacks that themselves return a Future. The
+  // returned child completes with the value (and storages) or the error of
+  // that intermediate future. A std::exception thrown by the callback itself
+  // is stored on the child as its error.
+  template <typename T>
+  c10::intrusive_ptr<Future> thenAsync(T callback, TypePtr type) {
+    static_assert(
+        std::is_invocable_r<c10::intrusive_ptr<Future>, T, Future&>::value,
+        "The callback must have signature c10::intrusive_ptr<Future>(Future&)");
+
+    auto child = createInstance(std::move(type));
+    addCallback(
+        [child, userCb = std::move(callback)](Future& parent) mutable {
+          c10::intrusive_ptr<Future> intermediate;
+          try {
+            intermediate = userCb(parent);
+          } catch (const std::exception&) {
+            child->setError(std::current_exception());
+            return;
+          }
+          // Forward the intermediate future's outcome to the child.
+          intermediate->addCallback(
+              [child = std::move(child)](Future& finished) {
+                if (!finished.hasError()) {
+                  child->markCompleted(finished.value(), finished.storages());
+                } else {
+                  child->setError(finished.exception_ptr());
+                }
+              });
+        });
+    return child;
+  }
+
+  // Tries to retrieve the error message from std::exception_ptr.
+  std::string tryRetrieveErrorMessage() const {
+    TORCH_CHECK(hasError(), "No error present on the future.");
+    std::unique_lock<std::mutex> lock(mutex_);
+    return tryRetrieveErrorMessageInternal(eptr_);
+  }
+
+  // Whether the future has completed. Reads completed_ without taking mutex_.
+  bool completed() const {
+    const bool done = completed_;
+    return done;
+  }
+
+  bool hasValue() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return completed_ && !eptr_;
+  }
+
+  // True when an exception is stored on the future.
+  bool hasError() const {
+    std::unique_lock<std::mutex> guard(mutex_);
+    return static_cast<bool>(eptr_);
+  }
+
+  // Returns a copy of the stored exception pointer, read under mutex_.
+  std::exception_ptr exception_ptr() const {
+    std::unique_lock<std::mutex> guard(mutex_);
+    std::exception_ptr stored = eptr_;
+    return stored;
+  }
+
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const Future& v);
+
+  // The type passed to the constructor, returned by reference.
+  const TypePtr& elementType() const {
+    return this->type_;
+  }
+
+  // The devices_ member, returned by reference.
+  const std::vector<c10::Device>& devices() const {
+    return this->devices_;
+  }
+
+  // Creates a new Future of the given type that is given this future's
+  // devices_. Intended for code that builds a child future by hand, for
+  // example a customized version of then().
+  c10::intrusive_ptr<Future> createInstance(at::TypePtr type) {
+    auto child = c10::make_intrusive<Future>(std::move(type), devices_);
+    return child;
+  }
+
+ private:
+
+  // Single entry point for running a callback, whenever and however that
+  // happens. When the callback uses the future, it runs under a device guard
+  // and a multi-stream guard (one stream per device, taken from the global
+  // pool) after synchronizing with those streams. Otherwise it is called
+  // directly.
+  template<typename T>
+  void invokeCallback(T callback, bool uses_future) {
+    static_assert(
+        std::is_invocable_r<void, T, Future&>::value,
+        "The callback must have signature void(Future&)");
+
+    // The synchronization performed below shouldn't be needed when the future
+    // is not used by the callback.
+    if (!uses_future) {
+      callback(*this);
+      return;
+    }
+
+    c10::OptionalDeviceGuard deviceGuard(currentDevice_);
+
+    std::vector<c10::Stream> poolStreams;
+    poolStreams.reserve(devices_.size());
+    for (const c10::Device& dev : devices_) {
+      poolStreams.push_back(impl_.getStreamFromGlobalPool(dev));
+    }
+    c10::MultiStreamGuard streamGuard(poolStreams);
+    synchronizeWithCurrentStreams();
+    callback(*this);
+  }
+
+  // Call this before the future's value is used. It makes the streams that
+  // are current at the call site wait on the recorded events, and records
+  // the data pointer of every still-alive non-CPU storage on the current
+  // stream of its device.
+  void synchronizeWithCurrentStreams() {
+    for (c10::Event& recorded : events_) {
+      recorded.block(impl_.getStream(recorded.device()));
+    }
+
+    for (const WeakStorage& weak : storages_) {
+      c10::intrusive_ptr<c10::StorageImpl> alive = weak.lock();
+      // Skip storages that are already gone and storages that live on CPU.
+      if (!alive || alive->device().is_cpu()) {
+        continue;
+      }
+      impl_.recordDataPtrOnStream(
+          alive->data_ptr(), impl_.getStream(alive->device()));
+    }
+  }
+
+  void setErrorInternal(
+      std::exception_ptr eptr,
+      std::unique_lock<std::mutex>& lock) {
+    TORCH_CHECK(
+        !eptr_,
+        "Error already set on this Future: ",
+        tryRetrieveErrorMessageInternal(eptr_),
+        ", trying to set error: ",
+        tryRetrieveErrorMessageInternal(eptr));
+    TORCH_INTERNAL_ASSERT(!completed(), "Future is already marked completed");
+    completed_ = true;
+    eptr_ = std::move(eptr);
+
+    std::vector<FutureCallback> cbs;
+    cbs.swap(callbacks_);
+    lock.unlock();
+
+    finished_cv_.notify_all();
+    for (auto& callback : cbs) {
+      invokeCallback(std::move(callback.callback), callback.uses_future);
+    }
+  }
+
+  // Best-effort extraction of a human-readable message from an exception
+  // pointer: the exception is rethrown and caught again. For anything derived
+  // from std::exception the result is what(); for any other type a fixed
+  // placeholder string is returned.
+  std::string tryRetrieveErrorMessageInternal(std::exception_ptr eptr) const {
+    try {
+      std::rethrow_exception(std::move(eptr));
+    } catch (const std::exception& ex) {
+      return ex.what();
+    } catch (...) {
+      return "Unknown Exception Type";
+    }
+  }
+
+  // Declaration only; the definition lives out-of-line in ivalue.cpp.
+  static std::vector<WeakStorage> extractStorages(
+      const at::IValue& value);
+
+  // Computes the list of non-CPU devices on which the given (still alive)
+  // storages reside. Every non-CPU storage must be on a device whose type
+  // matches `impl.type()`, otherwise a value error is raised. The result
+  // contains each used device once, ordered by increasing device index.
+  static std::vector<c10::Device> getDevicesOfStorages(
+      const c10::impl::VirtualGuardImpl& impl,
+      const std::vector<WeakStorage>& storages) {
+    const c10::DeviceIndex numDevices = impl.deviceCount();
+
+    // One flag per device index, set when some storage lives on that device.
+    std::vector<bool> usedFlags(numDevices, false);
+    for (const WeakStorage& weakRef : storages) {
+      c10::intrusive_ptr<c10::StorageImpl> strong = weakRef.lock();
+      if (!strong) {
+        // The storage has already been released.
+        continue;
+      }
+      c10::Device dev = strong->device();
+      if (dev.is_cpu()) {
+        continue;
+      }
+      TORCH_CHECK_VALUE(
+          dev.type() == impl.type(),
+          "Expected all data ptrs to be on a device of type ",
+          impl.type(),
+          ", got one on device ",
+          dev);
+      usedFlags[dev.index()] = true;
+    }
+
+    // Turn the flags back into a list of devices.
+    std::vector<c10::Device> usedDevices;
+    for (c10::DeviceIndex i = 0; i < numDevices; i++) {
+      if (usedFlags[i]) {
+        usedDevices.emplace_back(impl.type(), i);
+      }
+    }
+    return usedDevices;
+  }
+
+  static std::string formatSetOfDevices(
+      const std::vector<c10::Device>& devices) {
+    if (devices.empty()) {
+      return "(none)";
+    }
+    std::ostringstream oss;
+    oss << devices[0];
+    for (const auto idx : c10::irange(1, devices.size())) {
+      if (idx == devices.size() - 1) {
+        oss << " and ";
+      } else {
+        oss << ", ";
+      }
+      oss << devices[idx];
+    }
+    return oss.str();
+  }
+
+  // Returns the device type shared by all the given devices, or c10::kCPU when
+  // the list is empty. Raises a value error if any device has a type different
+  // from that of the first one.
+  static c10::DeviceType getTypeOfDevices(
+      const std::vector<c10::Device>& devices) {
+    if (devices.empty()) {
+      return c10::kCPU;
+    }
+    const c10::DeviceType commonType = devices[0].type();
+    for (size_t pos = 1; pos < devices.size(); ++pos) {
+      TORCH_CHECK_VALUE(
+          devices[pos].type() == commonType,
+          "Expected all devices to be of the same type, but got a mismatch between ",
+          devices[0],
+          " and ",
+          devices[pos]);
+    }
+    return commonType;
+  }
+
+  // Returns the given devices sorted by index with duplicate indices removed.
+  // Sorted input is what ensureIsSubsetOfDevices relies on. Raises a value
+  // error if any device lacks an index.
+  static std::vector<c10::Device> sortAndDeduplicateDevices(
+      const c10::impl::VirtualGuardImpl& /*impl*/,
+      std::vector<c10::Device> devices) {
+    const auto byIndex = [](const c10::Device& a, const c10::Device& b) {
+      return a.index() < b.index();
+    };
+    std::sort(devices.begin(), devices.end(), byIndex);
+
+    // Compact in place: `writePos` is where the next unique device goes.
+    size_t writePos = 0;
+    for (size_t readPos = 0; readPos < devices.size(); ++readPos) {
+      TORCH_CHECK_VALUE(
+          devices[readPos].has_index(),
+          "Expected devices to have indices, got ", devices[readPos]);
+      const bool isDuplicate = writePos > 0 &&
+          devices[writePos - 1].index() == devices[readPos].index();
+      if (isDuplicate) {
+        continue;
+      }
+      if (readPos != writePos) {
+        devices[writePos] = devices[readPos];
+      }
+      ++writePos;
+    }
+
+    // Drop the leftover tail (present only if duplicates were found). resize()
+    // needs a fill value because c10::Device is not default-constructible,
+    // even though the vector can only shrink here; the value is never used.
+    devices.resize(writePos, c10::Device(c10::kCPU));
+    return devices;
+  }
+
+  static void ensureIsSubsetOfDevices(
+      const std::vector<c10::Device>& subset,
+      const std::vector<c10::Device>& superset) {
+    // We assume the devices in both vectors have the same consistent type, and
+    // their indices are unique and sorted.
+    std::vector<c10::Device> excessDevices;
+    std::set_difference(
+        subset.begin(),
+        subset.end(),
+        superset.begin(),
+        superset.end(),
+        std::back_inserter(excessDevices),
+        [](const c10::Device& a, const c10::Device& b) { return a.index() < b.index(); });
+    TORCH_CHECK_VALUE(
+        excessDevices.empty(),
+        "The result contained tensors residing on device(s) ",
+        formatSetOfDevices(excessDevices),
+        " which are not among the expected device(s) ",
+        formatSetOfDevices(superset));
+  }
+
+  mutable std::mutex mutex_;
+  std::atomic_bool completed_ = {false}; // true once this future is complete (set in setErrorInternal on error)
+  std::condition_variable finished_cv_; // notified with notify_all() in setErrorInternal
+
+  IValue value_; // the result value; meaningful once the future has finished
+  TypePtr type_; // element type, exposed through elementType()
+  std::vector<FutureCallback> callbacks_; // pending callbacks; swapped out and invoked in setErrorInternal
+  std::exception_ptr eptr_; // the stored error, non-null once setErrorInternal has run
+
+  // An upcast pointer to a virtual class which allows us to manipulate events,
+  // streams, ... in a generic way, without an explicit dependency on CUDA.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const c10::impl::VirtualGuardImpl impl_;
+
+  // The device that was current when markCompleted was called, which we'll
+  // restore when invoking callbacks. It's optional because we'll only store it
+  // if the future completes successfully.
+  std::optional<c10::Device> currentDevice_;
+
+  // The events that correspond to the completion of the async I/O kernels. They
+  // are recorded on the appropriate streams when the future is marked completed
+  // and can then be queried/waited/blocked on. There is one event for each
+  // distinct device on which the value's tensors reside.
+  std::vector<c10::Event> events_;
+
+  // A cached version of the storages extracted from the value when the future
+  // is first marked completed.
+  std::vector<WeakStorage> storages_;
+
+  // The bounding set of devices that this future, and any of its children, is
+  // allowed to use. This is a superset of the set of devices used by the events
+  // above. We need this to know what streams (for which devices) to set as
+  // current when invoking a callback, thus allowing the callback to use devices
+  // that the parent future didn't use. This field is set to the value provided
+  // in the constructor and will be "inherited" by all child futures.
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const std::vector<c10::Device> devices_;
+};
+
+// A value that is computed on demand: wait() runs the stored function `fn_`
+// the first time it is called and caches the result, while markCompleted()
+// lets a caller provide the result directly. Instances are neither copyable
+// nor movable and can only be constructed through c10::intrusive_ptr (the
+// constructors are private).
+struct C10_EXPORT ivalue::Await final : c10::intrusive_ptr_target {
+ private:
+  // Note: `type_` is built from `elType_`, so `elType_` must be declared (and
+  // hence initialized) first.
+  explicit Await(TypePtr elType, std::function<IValue()> fn)
+      : elType_(std::move(elType)),
+        type_(AwaitType::create(elType_)),
+        fn_(std::move(fn)) {}
+
+  explicit Await(TypePtr elType)
+      : elType_(std::move(elType)), type_(AwaitType::create(elType_)) {}
+
+  friend c10::intrusive_ptr<Await>;
+
+ public:
+  Await(const Await&) = delete;
+  Await(Await&&) = delete;
+  Await& operator=(const Await&) = delete;
+  Await& operator=(Await&&) = delete;
+
+  // Returns the value, computing it through `fn_` on the first call. Once the
+  // value has been computed the stored args are cleared. Fails if the Await is
+  // not completed and no function has been set.
+  IValue wait() {
+    if (completed_) {
+      return value_;
+    }
+    TORCH_CHECK(fn_, "Incompleted Await: fn can't be None");
+    value_ = fn_();
+    completed_ = true;
+    args_ = {};
+    return value_;
+  }
+
+  // Returns the value of an already completed Await; fails otherwise.
+  IValue value() {
+    TORCH_CHECK(completed_, "Await must be completed");
+    return value_;
+  }
+
+  // Replaces the function that wait() will run.
+  void setFn(std::function<IValue()> fn) { fn_ = std::move(fn); }
+
+  // Whether a value is available (via wait() or markCompleted()).
+  bool completed() { return completed_; }
+
+  // Stores `value` as the result and marks the Await completed, without
+  // running `fn_`.
+  void markCompleted(IValue value) {
+    value_ = std::move(value);
+    completed_ = true;
+  }
+
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const Await& v);
+
+  // The element type passed to the constructor.
+  const TypePtr& elementType() const { return elType_; }
+
+  // The AwaitType created from the element type.
+  const TypePtr& type() const { return type_; }
+
+  // Replaces the stored argument list.
+  void setArgs(std::vector<IValue> args) { args_ = std::move(args); }
+
+  // Mutable access to the stored argument list.
+  std::vector<IValue>& args() { return args_; }
+
+ private:
+  TypePtr elType_;
+  TypePtr type_;
+  std::vector<IValue> args_;
+  std::function<IValue()> fn_;
+  IValue value_;
+  bool completed_{};
+};
+
+// Input is a List of Futures that all have the same target type.
+// Output is a Future to the List of completed Futures.
+TORCH_API intrusive_ptr<ivalue::Future> collectAll(
+    const c10::List<c10::intrusive_ptr<ivalue::Future>>& srcs);
+// Input is a List of Futures that all have the same target type.
+// Output is a Future that will be updated with a seen value.
+TORCH_API intrusive_ptr<ivalue::Future> collectAny(
+    const c10::List<c10::intrusive_ptr<ivalue::Future>>& srcs);
+
+// User-defined object: a type handle plus a flat vector of attribute slots.
+struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
+ public:
+  // In general, class types hold a shared_ptr to their owning CompilationUnit,
+  // so that the type and its methods do not get deallocated while the class
+  // exists. However, the CompilationUnit holds ownership of the type's graphs,
+  // so inserting a constant object into a Graph would create a reference cycle
+  // if that constant object held a shared_ptr to its CU. Such objects are
+  // instantiated with non-owning references to their CU.
+  Object(WeakOrStrongTypePtr type, size_t numSlots)
+      : type_(std::move(type)), slots_(numSlots) {}
+
+  Object(StrongTypePtr type, size_t numSlots)
+      : type_(WeakOrStrongTypePtr(std::move(type))), slots_(numSlots) {}
+
+  static c10::intrusive_ptr<Object> create(
+      WeakOrStrongTypePtr type,
+      size_t numSlots) {
+    return c10::make_intrusive<Object>(std::move(type), numSlots);
+  }
+
+  static c10::intrusive_ptr<Object> create(
+      StrongTypePtr type,
+      size_t numSlots) {
+    return c10::make_intrusive<Object>(std::move(type), numSlots);
+  }
+
+  static c10::intrusive_ptr<Object> create(ClassTypePtr classType, size_t numSlots);
+
+  /**
+   * Slot API.
+   *
+   * Attributes live in a plain vector so that runtime lookups are fast. A
+   * "slot" is simply an index into that vector, and it can be computed
+   * statically when the class type is available. This is the API to use when
+   * writing compiler code.
+   */
+  void setSlot(size_t slot, IValue v) {
+    const bool outOfRange = slot >= slots_.size();
+    if (outOfRange) {
+      // For module types the class may have gained members after this object
+      // was created; grow the slots to the right size in that case.
+      resizeObject(slot);
+    }
+    slots_[slot] = std::move(v);
+  }
+
+  const IValue& getSlot(size_t slot) const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(slot < slots_.size());
+    // NOTE: this lookup is fairly hot, hence the unchecked vector access.
+    // Errors should still be detectable with ASan.
+    return slots_[slot];
+  }
+
+  // Erases the slot at the given index; later slots shift down by one.
+  void unsafeRemoveSlot(size_t slot) {
+    TORCH_CHECK(slot < slots_.size());
+    const auto offset = static_cast<std::ptrdiff_t>(slot);
+    slots_.erase(slots_.begin() + offset);
+  }
+
+  /**
+   * Attribute API.
+   *
+   * Thin wrappers over the slot API that let users access attributes by
+   * name. This is the API to use as an end user.
+   *
+   * Note: unlike Python, TorchScript distinguishes attributes (which are
+   * IValues) from methods (which are Methods). To get a method, use
+   * `obj.type()->getMethod()`
+   */
+  IValue getAttr(const std::string& name) const;
+  void setAttr(const std::string& name, IValue v);
+  // Removes an attribute by name; the caller is responsible for the safety of
+  // this operation. The attribute is not removed from the type, because the
+  // type might be shared by multiple objects. After the removal the object is
+  // therefore in an inconsistent state, with more attribute types in its Type
+  // than attribute slots; the user has to restore consistency by removing the
+  // attribute from the type as well.
+  void unsafeRemoveAttr(const std::string& name);
+
+  std::string name() const;
+
+  const std::vector<IValue>& slots() const { return slots_; }
+  std::shared_ptr<ClassType> type() const;
+
+  // Returns the compilation unit as a shared_ptr, whether it is held through
+  // a strong or a weak reference.
+  std::shared_ptr<torch::jit::CompilationUnit> compilation_unit() {
+    if (!type_.holds_strong_ref()) {
+      auto weak_ptr = type_.cu_.getWeakRefOrThrow();
+      return std::shared_ptr<torch::jit::CompilationUnit>(weak_ptr);
+    }
+    return type_.cu_.getStrongRefOrThrow();
+  }
+
+  c10::intrusive_ptr<Object> copy_to_weak_compilation_ref() const;
+
+  // Replaces the stored type handle with its weak counterpart.
+  void unsafe_make_weak_compilation_ref() {
+    type_ = WeakOrStrongTypePtr(type_.asWeakTypePtr());
+  }
+
+  c10::intrusive_ptr<Object> copy() const;
+
+  c10::intrusive_ptr<Object> deepcopy(
+      std::optional<at::Device> device = std::nullopt) const;
+
+  c10::intrusive_ptr<Object> deepcopy(
+      IValue::HashIdentityIValueMap& memo,
+      std::optional<at::Device> device = std::nullopt) const;
+
+  bool is_weak_compilation_ref() const { return !type_.holds_strong_ref(); }
+
+  bool is_empty_strong_compilation_ref() const {
+    return type_.holds_empty_strong_ref();
+  }
+
+ private:
+  void resizeObject(size_t slot);
+  WeakOrStrongTypePtr type_;
+  std::vector<IValue> slots_;
+};
+
+// Abstract ivalue holder for a py::object. It is a pure-virtual interface
+// because the py::object and its refcounting logic must live in
+// libtorch_python; see the concrete implementation in python_ivalue.h.
+struct ivalue::PyObjectHolder : c10::intrusive_ptr_target {
+ public:
+  virtual PyObject* getPyObject() = 0; // pure virtual: provided by the concrete holder
+  virtual c10::InferredType tryToInferType() = 0; // pure virtual: provided by the concrete holder
+  virtual IValue toIValue(const TypePtr& type, std::optional<int32_t> N = std::nullopt) = 0; // N is optional and defaults to std::nullopt
+  virtual std::string toStr() = 0; // pure virtual: provided by the concrete holder
+  virtual std::vector<at::Tensor> extractTensors() = 0; // pure virtual: provided by the concrete holder
+
+  ~PyObjectHolder() override = default;
+};
+
+// Holds one enum value: its EnumType, the name of the value and the
+// underlying IValue.
+struct ivalue::EnumHolder : c10::intrusive_ptr_target {
+ public:
+  EnumHolder(std::shared_ptr<EnumType> type, std::string name, IValue value)
+      : type_(std::move(type)), name_(std::move(name)), value_(std::move(value)) {}
+
+  // Same result as comparing the two holders with operator==.
+  bool is(const ivalue::EnumHolder& rhs) { return *this == rhs; }
+
+  friend bool operator==(
+      const ivalue::EnumHolder& lhs,
+      const ivalue::EnumHolder& rhs);
+
+  TORCH_API friend std::ostream& operator<<(
+      std::ostream& out,
+      const ivalue::EnumHolder& v);
+
+  TORCH_API const std::string& qualifiedClassName() const;
+
+  const std::string& unqualifiedClassName() const;
+
+  // The name given to this enum value at construction.
+  const std::string& name() const { return name_; }
+
+  // The underlying value given at construction.
+  const IValue& value() const { return value_; }
+
+  // The enum type given at construction (returned as a shared_ptr copy).
+  std::shared_ptr<EnumType> type() const { return type_; }
+
+ private:
+  std::shared_ptr<EnumType> type_;
+  std::string name_;
+  IValue value_;
+};
+
+#undef TORCH_FORALL_TAGS
+
+namespace detail {
+
+// Stand-in type, constructible from int64_t, that takes the place of
+// `unsigned long` whenever `unsigned long` is the very same type as uint32_t
+// or uint64_t.
+struct _guarded_unsigned_long_unique_dummy final {
+  _guarded_unsigned_long_unique_dummy(int64_t) {}
+};
+
+// `unsigned long` if it is a type distinct from both uint32_t and uint64_t,
+// otherwise the unique dummy type above.
+using _guarded_unsigned_long = std::conditional_t<
+    !std::is_same_v<unsigned long, uint32_t> &&
+        !std::is_same_v<unsigned long, uint64_t>,
+    unsigned long,
+    _guarded_unsigned_long_unique_dummy>;
+
+} // namespace detail
+
+// Returns a reference to the Object held by this IValue, without taking a new
+// owning pointer. Asserts that the tag is Object and, in debug builds, that
+// the payload is not the undefined-tensor singleton.
+inline ivalue::Object& IValue::toObjectRef() const {
+  AT_ASSERT(isObject(), "Expected Object but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), "Attempted to create null reference");
+  auto* objectPtr = static_cast<c10::ivalue::Object*>(payload.u.as_intrusive_ptr);
+  return *objectPtr;
+}
+
+// Note: whenever a DEFINE_TO case is added here, a matching toX method should
+// be added to IValue as well. Those named methods are much more discoverable
+// than the templated `to` function.
+//
+// DEFINE_TO(T, method_name) specializes both IValue::to<T>() overloads (the
+// rvalue one and the const-lvalue one) so that they forward to `method_name`.
+
+#define DEFINE_TO(T, method_name)                                                           \
+  template <>                                                                               \
+  inline T IValue::to<T>()&& {                                                              \
+    return static_cast<T>(std::move(*this).method_name());                                  \
+  }                                                                                         \
+  template <>                                                                               \
+  inline c10::detail::ivalue_to_const_ref_overload_return<T>::type IValue::to<T>() const& { \
+    using return_type = c10::detail::ivalue_to_const_ref_overload_return<T>::type;          \
+    return static_cast<return_type>(this->method_name());                                   \
+  }
+
+// Explicit IValue::to<T>() specializations, one per supported unboxed type.
+// Each line maps the C++ type T to the named IValue accessor that performs
+// the conversion. DEFINE_TO expands to complete function definitions, so no
+// trailing semicolon is used after an invocation.
+
+// Tensors, storages and streams.
+DEFINE_TO(at::Tensor, toTensor)
+DEFINE_TO(at::Storage, toStorage)
+DEFINE_TO(c10::Stream, toStream)
+// Floating point and complex scalars.
+DEFINE_TO(float, toDouble)
+DEFINE_TO(double, toDouble)
+DEFINE_TO(c10::complex<double>, toComplexDouble)
+// Integral types all go through toInt.
+DEFINE_TO(unsigned char, toInt)
+DEFINE_TO(signed char, toInt)
+DEFINE_TO(unsigned short, toInt)
+DEFINE_TO(short, toInt)
+DEFINE_TO(int, toInt)
+DEFINE_TO(uint32_t, toInt)
+DEFINE_TO(uint64_t, toInt)
+DEFINE_TO(detail::_guarded_unsigned_long, toInt)
+DEFINE_TO(int64_t, toInt)
+DEFINE_TO(bool, toBool)
+// Reference-counted holders, scalars and containers.
+DEFINE_TO(c10::intrusive_ptr<caffe2::Blob>, toBlob)
+DEFINE_TO(c10::intrusive_ptr<ivalue::ConstantString>, toString)
+DEFINE_TO(c10::intrusive_ptr<ivalue::Object>, toObject)
+DEFINE_TO(at::Scalar, toScalar)
+DEFINE_TO(c10::List<int64_t>, toIntList)
+DEFINE_TO(c10::List<double>, toDoubleList)
+DEFINE_TO(c10::List<c10::complex<double>>, toComplexDoubleList)
+DEFINE_TO(c10::List<bool>, toBoolList)
+DEFINE_TO(c10::List<at::Tensor>, toTensorList)
+DEFINE_TO(c10::impl::GenericList, toList)
+DEFINE_TO(c10::impl::GenericDict, toGenericDict)
+DEFINE_TO(c10::intrusive_ptr<ivalue::Tuple>, toTuple)
+DEFINE_TO(std::string, toStringRef)
+DEFINE_TO(c10::string_view, toStringView)
+DEFINE_TO(c10::intrusive_ptr<ivalue::Future>, toFuture)
+DEFINE_TO(c10::intrusive_ptr<ivalue::Await>, toAwait)
+DEFINE_TO(c10::intrusive_ptr<c10::RRefInterface>, toRRef)
+DEFINE_TO(c10::intrusive_ptr<at::Quantizer>, toQuantizer)
+DEFINE_TO(IValue, toIValue)
+// Devices, enum-like metadata types and symbolic values.
+DEFINE_TO(c10::Device, toDevice)
+DEFINE_TO(at::ScalarType, toScalarType)
+DEFINE_TO(at::Layout, toLayout)
+DEFINE_TO(at::MemoryFormat, toMemoryFormat)
+DEFINE_TO(at::QScheme, toQScheme)
+DEFINE_TO(at::Dimname, toDimname)
+DEFINE_TO(at::Generator, toGenerator)
+DEFINE_TO(c10::SymInt, toSymInt)
+DEFINE_TO(c10::SymFloat, toSymFloat)
+DEFINE_TO(c10::SymBool, toSymBool)
+
+// Empty tag type: carries a type T purely at compile time so that functions
+// can be overloaded on it.
+template <typename T>
+struct _fake_type {};
+
+// generic_to<T> converts an IValue from a generic list or generic dict
+// to a concrete list/dict type like List<T>, Dict<...> or std::optional<T>.
+// Note that in the case of lists, this only works for IValue-based lists,
+// i.e. not for int64_t, double, ...
+// generic_to<T> is an implementation detail of IValue::to<T> and not
+// supposed to be called directly.
+// The _fake_type<T> parameter allows us to overload
+// based on the return type.
+template <class Elem>
+// TODO: this overload is deprecated, but no warning is emitted because many
+// ops in native_functions.yaml still return std::vector.
+// C10_DEPRECATED_MESSAGE("IValues based on std::vector<T> are potentially slow
+// and deprecated. Please use torch::List<T> instead.")
+std::vector<Elem> generic_to(IValue ivalue, _fake_type<std::vector<Elem>>) {
+  // The elements are copied out one by one rather than moved out of the list:
+  // other references to the same IValue may still be using that list.
+  auto typedList = std::move(ivalue).to<List<Elem>>();
+  std::vector<Elem> copied;
+  copied.reserve(typedList.size());
+  for (Elem item : typedList) {
+    copied.push_back(std::move(item));
+  }
+  return copied;
+}
+
+// Rvalue overload. Extracts the custom class instance of type T stored in
+// this IValue: the held Object must have exactly one slot, and the IValue's
+// type is checked against the registered custom class type for T.
+template <typename T>
+c10::intrusive_ptr<T> IValue::toCustomClass() && {
+  static_assert(
+      std::is_base_of_v<torch::CustomClassHolder, T>,
+      "toCustomClass requires that template parameter T must inherit "
+      "from torch::CustomClassHolder");
+  auto holder = toObject();
+  TORCH_CHECK(
+      holder->slots().size() == 1,
+      "Tried to cast IValue to custom class but it did "
+      "not contain a custom class!");
+  const auto* wantedType = c10::getCustomClassType<c10::intrusive_ptr<T>>().get();
+  ivalue::checkCustomClassType(wantedType, type().get());
+  // The single slot holds the capsule wrapping the user object.
+  return c10::static_intrusive_pointer_cast<T>(holder->getSlot(0).toCapsule());
+}
+
+// Const-lvalue overload. Extracts the custom class instance of type T stored
+// in this IValue: the held Object must have exactly one slot, and the
+// IValue's type is checked against the registered custom class type for T.
+template <typename T>
+c10::intrusive_ptr<T> IValue::toCustomClass() const& {
+  static_assert(
+      std::is_base_of_v<torch::CustomClassHolder, T>,
+      "toCustomClass requires that template parameter T must inherit "
+      "from torch::CustomClassHolder");
+  auto holder = toObject();
+  TORCH_CHECK(
+      holder->slots().size() == 1,
+      "Tried to cast IValue to custom class but it did "
+      "not contain a custom class!");
+  const auto* wantedType = c10::getCustomClassType<c10::intrusive_ptr<T>>().get();
+  ivalue::checkCustomClassType(wantedType, type().get());
+  // The single slot holds the capsule wrapping the user object.
+  return c10::static_intrusive_pointer_cast<T>(holder->getSlot(0).toCapsule());
+}
+
+// Catch-all overload: T is expected to expose an `element_type` (after
+// stripping one pointer level), and the conversion goes through
+// toCustomClass on that element type.
+template <typename T>
+T generic_to(IValue ivalue, _fake_type<T>) {
+  using Holder = std::remove_pointer_t<T>;
+  using ElemType = typename Holder::element_type;
+  return std::move(ivalue).toCustomClass<ElemType>();
+}
+
+// Wraps the IValue itself into a tagged_capsule<T>; no conversion takes place.
+template <typename T>
+tagged_capsule<T> generic_to(IValue ivalue, _fake_type<tagged_capsule<T>>) {
+  tagged_capsule<T> capsule{std::move(ivalue)};
+  return capsule;
+}
+
+// Converts a generic list IValue into a typed c10::List<Elem>.
+template <typename Elem>
+c10::List<Elem> generic_to(IValue ivalue, _fake_type<c10::List<Elem>>) {
+  auto genericList = std::move(ivalue).toList();
+  return impl::toTypedList<Elem>(std::move(genericList));
+}
+
+// Builds a container of type T (anything offering reserve, push_back and
+// value_type) by converting each IValue of the given list implementation to
+// T::value_type.
+template <typename T>
+static T createVectorLikeFromList(const c10::detail::ListImpl* impl) {
+  T converted;
+  converted.reserve(impl->list.size());
+  for (const auto& element : impl->list) {
+    converted.push_back(element.to<typename T::value_type>());
+  }
+  return converted;
+}
+
+// std::vector<T> flavor of createVectorLikeFromList.
+template <typename T>
+static std::vector<T> createVectorFromList(const c10::detail::ListImpl* impl) {
+  using Result = std::vector<T>;
+  return createVectorLikeFromList<Result>(impl);
+}
+
+// Copies the elements of a typed c10::List<T> into a std::vector<T>,
+// preserving their order.
+template <typename T>
+std::vector<T> createVectorFromList(const c10::List<T>& impl) {
+  const size_t count = impl.size();
+  std::vector<T> copied;
+  copied.reserve(count);
+  for (size_t pos = 0; pos < count; ++pos) {
+    copied.push_back(impl[pos]);
+  }
+  return copied;
+}
+
+// None becomes a default-constructed OptionalArray; anything else is
+// converted to a List<T> and copied into a vector.
+template <typename T>
+OptionalArray<T> generic_to(IValue ivalue, _fake_type<OptionalArray<T>>) {
+  if (!ivalue.isNone()) {
+    return createVectorFromList<T>(
+      std::move(ivalue).to<c10::List<T>>()
+    );
+  }
+  return {};
+}
+
+namespace detail {
+// Helper for the std::array overload of generic_to: the index pack I... is
+// 0..N-1 and is expanded to copy one list element per array position. Fails
+// if the list doesn't have exactly sizeof...(I) elements.
+template <typename Elem, size_t... I>
+std::array<Elem, sizeof...(I)> generic_to_array(
+    IValue ivalue,
+    _fake_type<std::array<Elem, sizeof...(I)>>,
+    std::index_sequence<I...>) {
+  // The elements are copied rather than moved out of the list: other
+  // references to the same IValue may still be using that list.
+  auto typedList = std::move(ivalue).to<List<Elem>>();
+  constexpr size_t kExpectedSize = sizeof...(I);
+  TORCH_CHECK(
+      typedList.size() == kExpectedSize,
+      "Tried to convert a List with ",
+      typedList.size(),
+      " elements to a fixed-size array of size ",
+      kExpectedSize);
+  return {typedList[I]...};
+}
+} // namespace detail
+
+// Converts a list IValue into a fixed-size std::array<Elem, N>; the actual
+// work, including the size check, is done by detail::generic_to_array.
+template <typename Elem, size_t N>
+std::array<Elem, N> generic_to(
+    IValue ivalue,
+    _fake_type<std::array<Elem, N>> ft) {
+  // `ivalue` is our own by-value copy and is not used again, so move it into
+  // the helper rather than copying the IValue one more time.
+  return detail::generic_to_array(
+      std::move(ivalue), ft, std::make_index_sequence<N>());
+}
+
+// Converts a generic dict IValue into a typed c10::Dict<Key, Value>.
+template <typename Key, typename Value>
+c10::Dict<Key, Value> generic_to(
+    IValue ivalue,
+    _fake_type<c10::Dict<Key, Value>>) {
+  auto genericDict = std::move(ivalue).toGenericDict();
+  return impl::toTypedDict<Key, Value>(std::move(genericDict));
+}
+
+// Deprecated conversion of a generic dict IValue to std::unordered_map<K, V>:
+// every key and value is converted and copied into a freshly built map.
+template <typename K, typename V>
+C10_DEPRECATED_MESSAGE(
+    "IValues based on std::unordered_map are slow and deprecated. Please use c10::Dict<K, V> instead.")
+std::unordered_map<K, V> generic_to(
+    IValue ivalue,
+    _fake_type<std::unordered_map<K, V>>) {
+  std::unordered_map<K, V> converted;
+
+  auto genericDict = std::move(ivalue).toGenericDict();
+  for (const auto& entry : genericDict) {
+    converted[entry.key().template to<K>()] = entry.value().template to<V>();
+  }
+
+  return converted;
+}
+
+// None maps to std::nullopt; any other IValue is converted to T.
+template <typename T>
+std::optional<T> generic_to(IValue ivalue, _fake_type<std::optional<T>>) {
+  if (!ivalue.isNone()) {
+    return std::move(ivalue).to<T>();
+  }
+  return std::nullopt;
+}
+
+namespace detail {
+// Converts the element at each position INDEX of `t` to the corresponding
+// element type of Tuple and packs the results with std::make_tuple.
+template <typename Tuple, std::size_t... INDEX>
+Tuple generic_to_tuple_impl(
+    const ivalue::TupleElements& t,
+    std::index_sequence<INDEX...>) {
+  return std::make_tuple(
+      t[INDEX].to<std::tuple_element_t<INDEX, Tuple>>()...);
+}
+} // namespace detail
+
+// Converts a tuple IValue to std::tuple<Args...>. The overload only
+// participates when no Args is an lvalue reference and an IValue can be
+// constructed from every Args. Fails if the number of tuple elements differs
+// from sizeof...(Args).
+template <
+    typename... Args,
+    typename Indices = std::make_index_sequence<sizeof...(Args)>,
+    std::enable_if_t<
+        !std::disjunction_v<
+            std::is_lvalue_reference<Args>...,
+            std::negation<std::is_constructible<IValue, Args>>...>,
+        std::nullptr_t> = nullptr>
+std::tuple<Args...> generic_to(const IValue& ivalue, _fake_type<std::tuple<Args...>>) {
+  using Result = std::tuple<Args...>;
+  const auto& elems = ivalue.toTupleRef().elements();
+  TORCH_CHECK(elems.size() == sizeof...(Args));
+  return detail::generic_to_tuple_impl<Result>(elems, Indices{});
+}
+
+// Generic rvalue overload: moves this IValue into the generic_to overload
+// selected by the _fake_type<T> tag.
+template <typename T>
+inline T IValue::to() && {
+  _fake_type<T> tag{};
+  return generic_to(std::move(*this), tag);
+}
+
+template <>
+inline std::optional<c10::string_view> IValue::to() && {
+  // The default rvalue implementation hands the IValue over with std::move,
+  // after which it gets destroyed. That must not happen when unboxing to
+  // std::optional<string_view>, so the IValue is passed as an lvalue here.
+  using Result = std::optional<c10::string_view>;
+  return generic_to(*this, _fake_type<Result>{});
+}
+
+// Generic const-lvalue overload: passes this IValue (not moved) to the
+// generic_to overload selected by the _fake_type<T> tag.
+template <typename T>
+inline typename c10::detail::ivalue_to_const_ref_overload_return<T>::type IValue::to() const& {
+  _fake_type<T> tag{};
+  return generic_to(*this, tag);
+}
+
+// Rvalue overload: asserts the IntList tag, then moves the list payload out
+// of this IValue into the returned List.
+inline c10::List<int64_t> IValue::toIntList() && {
+  AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind());
+  auto listImpl = moveToIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<int64_t>(std::move(listImpl));
+}
+// Const-lvalue overload: asserts the IntList tag and returns a List sharing
+// the payload of this IValue.
+inline c10::List<int64_t> IValue::toIntList() const& {
+  AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind());
+  auto listImpl = toIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<int64_t>(std::move(listImpl));
+}
+// Copies the elements of an IntList IValue into a std::vector<int64_t>.
+inline std::vector<int64_t> IValue::toIntVector() const {
+  AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toIntVector on null intrusive_ptr IValue");
+  const auto* listImpl =
+      static_cast<const c10::detail::ListImpl*>(payload.u.as_intrusive_ptr);
+  return createVectorFromList<int64_t>(listImpl);
+}
+// Converts the elements of a SymIntList or IntList IValue into a
+// std::vector<c10::SymInt>.
+inline std::vector<c10::SymInt> IValue::toSymIntVector() const {
+  AT_ASSERT(isSymIntList() || isIntList(), "Expected SymIntList or IntList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toSymIntVector on null intrusive_ptr IValue");
+  const auto* listImpl =
+      static_cast<const c10::detail::ListImpl*>(payload.u.as_intrusive_ptr);
+  return createVectorFromList<c10::SymInt>(listImpl);
+}
+// Copies the elements of an IntList IValue into an at::DimVector.
+inline at::DimVector IValue::toDimVector() const {
+  AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toDimVector on null intrusive_ptr IValue");
+  const auto* listImpl =
+      static_cast<const c10::detail::ListImpl*>(payload.u.as_intrusive_ptr);
+  return createVectorLikeFromList<at::DimVector>(listImpl);
+}
+// Rvalue overload: asserts the DoubleList tag, then moves the list payload
+// out of this IValue into the returned List.
+inline c10::List<double> IValue::toDoubleList() && {
+  AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind());
+  auto listImpl = moveToIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<double>(std::move(listImpl));
+}
+// Const-lvalue overload: asserts the DoubleList tag and returns a List
+// sharing the payload of this IValue.
+inline c10::List<double> IValue::toDoubleList() const& {
+  AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind());
+  auto listImpl = toIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<double>(std::move(listImpl));
+}
+// Copies the elements of a DoubleList IValue into a std::vector<double>.
+inline std::vector<double> IValue::toDoubleVector() const {
+  AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toDoubleVector on null intrusive_ptr IValue");
+  const auto* listImpl =
+      static_cast<const c10::detail::ListImpl*>(payload.u.as_intrusive_ptr);
+  return createVectorFromList<double>(listImpl);
+}
+// Rvalue overload: asserts the ComplexDoubleList tag, then moves the list
+// payload out of this IValue into the returned List.
+inline c10::List<c10::complex<double>> IValue::toComplexDoubleList() && {
+  AT_ASSERT(isComplexDoubleList(), "Expected ComplexDoubleList but got ", tagKind());
+  auto listImpl = moveToIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<c10::complex<double>>(std::move(listImpl));
+}
+// Const-lvalue overload: asserts the ComplexDoubleList tag and returns a List
+// sharing the payload of this IValue.
+inline c10::List<c10::complex<double>> IValue::toComplexDoubleList() const& {
+  AT_ASSERT(isComplexDoubleList(), "Expected ComplexDoubleList but got ", tagKind());
+  auto listImpl = toIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<c10::complex<double>>(std::move(listImpl));
+}
+// Copies the elements of a ComplexDoubleList IValue into a
+// std::vector<c10::complex<double>>.
+inline std::vector<c10::complex<double>> IValue::toComplexDoubleVector() const {
+  AT_ASSERT(isComplexDoubleList(), "Expected ComplexDoubleList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toComplexDoubleVector on null intrusive_ptr IValue");
+  const auto* listImpl =
+      static_cast<const c10::detail::ListImpl*>(payload.u.as_intrusive_ptr);
+  return createVectorFromList<c10::complex<double>>(listImpl);
+}
+// Rvalue overload: asserts the BoolList tag, then moves the list payload out
+// of this IValue into the returned List.
+inline c10::List<bool> IValue::toBoolList() && {
+  AT_ASSERT(isBoolList(), "Expected BoolList but got ", tagKind());
+  auto listImpl = moveToIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<bool>(std::move(listImpl));
+}
+// Const-lvalue overload: asserts the BoolList tag and returns a List sharing
+// the payload of this IValue.
+inline c10::List<bool> IValue::toBoolList() const& {
+  AT_ASSERT(isBoolList(), "Expected BoolList but got ", tagKind());
+  auto listImpl = toIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<bool>(std::move(listImpl));
+}
+// Rvalue overload: asserts the TensorList tag, then moves the list payload
+// out of this IValue into the returned List.
+inline c10::List<at::Tensor> IValue::toTensorList() && {
+  AT_ASSERT(isTensorList(), "Expected TensorList but got ", tagKind());
+  auto listImpl = moveToIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<at::Tensor>(std::move(listImpl));
+}
+// Const-lvalue overload: asserts the TensorList tag and returns a List
+// sharing the payload of this IValue.
+inline c10::List<at::Tensor> IValue::toTensorList() const& {
+  AT_ASSERT(isTensorList(), "Expected TensorList but got ", tagKind());
+  auto listImpl = toIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<at::Tensor>(std::move(listImpl));
+}
+// Copies the elements of a TensorList IValue into a std::vector<at::Tensor>.
+inline std::vector<at::Tensor> IValue::toTensorVector() const {
+  AT_ASSERT(isTensorList(), "Expected TensorList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toTensorVector on null intrusive_ptr IValue");
+  const auto* listImpl =
+      static_cast<const c10::detail::ListImpl*>(payload.u.as_intrusive_ptr);
+  return createVectorFromList<at::Tensor>(listImpl);
+}
+// Rvalue overload: asserts the OptionalTensorList tag, then moves the list
+// payload out of this IValue into the returned List.
+inline c10::List<std::optional<at::Tensor>> IValue::toOptionalTensorList() && {
+  AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind());
+  auto listImpl = moveToIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<std::optional<at::Tensor>>(std::move(listImpl));
+}
+// Const-lvalue overload: asserts the OptionalTensorList tag and returns a
+// List sharing the payload of this IValue.
+inline c10::List<std::optional<at::Tensor>> IValue::toOptionalTensorList() const& {
+  AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind());
+  auto listImpl = toIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<std::optional<at::Tensor>>(std::move(listImpl));
+}
+// Copies the elements of an OptionalTensorList IValue into a
+// std::vector<std::optional<at::Tensor>>.
+inline std::vector<std::optional<at::Tensor>> IValue::toOptionalTensorVector() const {
+  AT_ASSERT(isOptionalTensorList(), "Expected OptionalTensorList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toOptionalTensorVector on null intrusive_ptr IValue");
+  const auto* listImpl =
+      static_cast<const c10::detail::ListImpl*>(payload.u.as_intrusive_ptr);
+  return createVectorFromList<std::optional<at::Tensor>>(listImpl);
+}
+// Rvalue overload: asserts the GenericList tag, then wraps the pointer
+// obtained from moveToIntrusivePtr.
+inline c10::List<IValue> IValue::toList() && {
+  AT_ASSERT(isList(), "Expected GenericList but got ", tagKind());
+  auto impl = moveToIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<IValue>(std::move(impl));
+}
+// Const-lvalue overload: asserts the GenericList tag, then wraps the pointer
+// obtained from toIntrusivePtr.
+inline c10::List<IValue> IValue::toList() const& {
+  AT_ASSERT(isList(), "Expected GenericList but got ", tagKind());
+  auto impl = toIntrusivePtr<c10::detail::ListImpl>();
+  return c10::List<IValue>(std::move(impl));
+}
+// Returns an ArrayRef over the `list` member of the stored ListImpl.
+// Asserts the GenericList tag; in debug builds also asserts that the stored
+// pointer is not the UndefinedTensorImpl singleton.
+inline c10::ArrayRef<IValue> IValue::toListRef() const {
+  AT_ASSERT(isList(), "Expected GenericList but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toListRef on null intrusive_ptr IValue");
+  const auto* impl =
+      static_cast<const c10::detail::ListImpl*>(payload.u.as_intrusive_ptr);
+  return impl->list;
+}
+// Rvalue overload: asserts the GenericDict tag, then wraps the pointer
+// obtained from moveToIntrusivePtr.
+inline c10::Dict<IValue, IValue> IValue::toGenericDict() && {
+  AT_ASSERT(isGenericDict(), "Expected GenericDict but got ", tagKind());
+  auto impl = moveToIntrusivePtr<c10::detail::DictImpl>();
+  return c10::Dict<IValue, IValue>(std::move(impl));
+}
+// Const-lvalue overload: asserts the GenericDict tag, then wraps the pointer
+// obtained from toIntrusivePtr.
+inline c10::Dict<IValue, IValue> IValue::toGenericDict() const& {
+  AT_ASSERT(isGenericDict(), "Expected GenericDict but got ", tagKind());
+  auto impl = toIntrusivePtr<c10::detail::DictImpl>();
+  return c10::Dict<IValue, IValue>(std::move(impl));
+}
+// Rvalue overload: asserts the Tuple tag and returns the pointer produced by
+// moveToIntrusivePtr.
+inline c10::intrusive_ptr<ivalue::Tuple> IValue::toTuple() && {
+  AT_ASSERT(isTuple(), "Expected Tuple but got ", tagKind());
+  auto tuple_ptr = moveToIntrusivePtr<ivalue::Tuple>();
+  return tuple_ptr;
+}
+// Const-lvalue overload: asserts the Tuple tag and returns the pointer
+// produced by toIntrusivePtr.
+inline c10::intrusive_ptr<ivalue::Tuple> IValue::toTuple() const& {
+  AT_ASSERT(isTuple(), "Expected Tuple but got ", tagKind());
+  auto tuple_ptr = toIntrusivePtr<ivalue::Tuple>();
+  return tuple_ptr;
+}
+// Returns a reference to the stored Tuple without creating a new owner.
+// Asserts the Tuple tag; in debug builds also asserts that the stored
+// pointer is not the UndefinedTensorImpl singleton.
+inline ivalue::Tuple& IValue::toTupleRef() const {
+  AT_ASSERT(isTuple(), "Expected Tuple but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toTupleRef on null intrusive_ptr IValue");
+  auto* tuple = static_cast<c10::ivalue::Tuple*>(payload.u.as_intrusive_ptr);
+  return *tuple;
+}
+
+// Takes over the Tuple pointer: tags the value as Tuple and stores the
+// released raw pointer after passing it through null_to_undefined_tensor.
+inline IValue::IValue(c10::intrusive_ptr<ivalue::Tuple> v)
+    : tag(Tag::Tuple) {
+  auto* raw = v.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+// Builds a Tuple IValue from a const std::tuple by applying
+// ivalue::Tuple::create to its elements, each passed as `const Args&`.
+// The enable_if removes this overload when any Args is an lvalue reference
+// type or when an IValue cannot be constructed from some Args.
+template <
+    typename... Args,
+    std::enable_if_t<
+        !std::disjunction_v<
+            std::is_lvalue_reference<Args>...,
+            std::negation<std::is_constructible<IValue, Args>>...>,
+        std::nullptr_t>>
+inline IValue::IValue(const std::tuple<Args...>& t)
+    : IValue(c10::guts::apply(c10::ivalue::Tuple::create<const Args&...>, t)) {
+}
+
+// Rvalue counterpart of the constructor above: the tuple is moved into
+// c10::guts::apply and its elements reach Tuple::create as `Args&&`.
+// Same enable_if condition as the const-reference overload.
+template <
+    typename... Args,
+    std::enable_if_t<
+        !std::disjunction_v<
+            std::is_lvalue_reference<Args>...,
+            std::negation<std::is_constructible<IValue, Args>>...>,
+        std::nullptr_t>>
+inline IValue::IValue(std::tuple<Args...>&& t)
+    : IValue(c10::guts::apply(c10::ivalue::Tuple::create<Args&&...>, std::move(t))) {
+}
+
+// Takes over a ConstantString pointer: tags the value as String and stores
+// the released raw pointer after passing it through null_to_undefined_tensor.
+inline IValue::IValue(c10::intrusive_ptr<ivalue::ConstantString> v)
+    : tag(Tag::String) {
+  auto* raw = v.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+// Moves the std::string into ConstantString::create and delegates to the
+// ConstantString constructor above.
+inline IValue::IValue(std::string v)
+    : IValue(ivalue::ConstantString::create(std::move(v))) {}
+
+// Tags the value as GenericList and stores the pointer released from the
+// list's impl_ after passing it through null_to_undefined_tensor.
+inline IValue::IValue(c10::impl::GenericList v)
+    : tag(Tag::GenericList) {
+  auto* raw = v.impl_.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+
+// Typed list, rvalue: passed through impl::toList<T> and on to the
+// GenericList constructor.
+template <class T, IValue::enable_if_list_is_ivalue_constructible<T>>
+inline IValue::IValue(c10::List<T>&& v)
+    : IValue(impl::toList<T>(std::move(v))) {}
+// Typed list, const lvalue: same delegation without the move.
+template <class T, IValue::enable_if_list_is_ivalue_constructible<T>>
+inline IValue::IValue(const c10::List<T>& v)
+    : IValue(impl::toList<T>(v)) {}
+// Starts from an empty List<T>, then appends a copy of every ArrayRef
+// element through the List handle returned by to<c10::List<T>>().
+template <class T, IValue::enable_if_list_is_ivalue_constructible<T>>
+inline IValue::IValue(at::ArrayRef<T> v) : IValue(c10::List<T>()) {
+  auto elems = to<c10::List<T>>();
+  elems.reserve(v.size());
+  for (auto it = v.begin(); it != v.end(); ++it) {
+    elems.push_back(*it);
+  }
+}
+// SymInt ArrayRef: becomes an int list when asIntArrayRefSlowOpt yields a
+// value, otherwise a List<c10::SymInt>.
+template <class T, IValue::enable_if_symint<T>>
+inline IValue::IValue(at::ArrayRef<T> v) : IValue() {
+  auto maybe_ints = c10::asIntArrayRefSlowOpt(v);
+  if (maybe_ints.has_value()) {
+    // This list is entirely integers; ensure it is typed as
+    // an IntList so toIntList works
+    *this = IValue(*maybe_ints);
+    return;
+  }
+  // This list has SymInts; type it as a SymInt
+  *this = IValue(impl::toList<c10::SymInt>(c10::List<c10::SymInt>()));
+  auto sym_list = to<c10::List<c10::SymInt>>();
+  sym_list.reserve(v.size());
+  for (const auto& elem : v) {
+    sym_list.push_back(elem);
+  }
+}
+// Optional SymInt ArrayRef: stays default-constructed when empty, otherwise
+// is assigned from the contained ArrayRef.
+template <class T, IValue::enable_if_symint<T>>
+inline IValue::IValue(at::OptionalArrayRef<T> mb_v) : IValue() {
+  if (mb_v.has_value()) {
+    *this = IValue(*mb_v);
+  }
+}
+// Const SymInt vector: viewed as an ArrayRef<T> and routed through the
+// ArrayRef<T> constructor.
+template <class T, IValue::enable_if_symint<T>>
+inline IValue::IValue(const std::vector<T>& v) : IValue() {
+  const at::ArrayRef<T> view(v);
+  *this = IValue(view);
+}
+// Rvalue SymInt vector: becomes an int list when asIntArrayRefSlowOpt yields
+// a value, otherwise a List<c10::SymInt> whose elements are moved out of the
+// vector.
+template <class T, IValue::enable_if_symint<T>>
+inline IValue::IValue(std::vector<T>&& v) : IValue() {
+  auto maybe_ints = c10::asIntArrayRefSlowOpt(v);
+  if (maybe_ints.has_value()) {
+    // This list is entirely integers; ensure it is typed as
+    // an IntList so toIntList works
+    *this = IValue(*maybe_ints);
+    return;
+  }
+  // This list has SymInts; type it as a SymInt
+  *this = IValue(impl::toList<c10::SymInt>(c10::List<c10::SymInt>()));
+  auto sym_list = to<c10::List<c10::SymInt>>();
+  sym_list.reserve(v.size());
+  for (auto&& elem : std::move(v)) {
+    sym_list.push_back(std::move(elem));
+  }
+}
+// Const vector: starts from an empty List<T> and appends a copy of each
+// element through the List handle returned by to<c10::List<T>>().
+template <class T, IValue::enable_if_list_is_ivalue_constructible<T>>
+inline IValue::IValue(const std::vector<T>& v) : IValue(c10::List<T>()) {
+  auto elems = to<c10::List<T>>();
+  elems.reserve(v.size());
+  for (const auto& item : v) {
+    elems.push_back(item);
+  }
+}
+
+// Rvalue vector: starts from an empty List<T>; elements are moved into the
+// list, except for T == bool, where they are copied by value.
+template <class T, IValue::enable_if_list_is_ivalue_constructible<T>>
+inline IValue::IValue(std::vector<T>&& v) : IValue(c10::List<T>()) {
+  auto elems = to<c10::List<T>>();
+  elems.reserve(v.size());
+  if constexpr (!std::is_same_v<T, bool>) {
+    for (auto&& item : std::move(v)) {
+      elems.push_back(std::move(item));
+    }
+  } else {
+    for (auto item : v) {
+      elems.push_back(item);
+    }
+  }
+}
+
+// Optional ArrayRef: stays default-constructed when empty, otherwise is
+// assigned from the contained ArrayRef.
+template <class T, IValue::enable_if_list_is_ivalue_constructible<T>>
+inline IValue::IValue(c10::OptionalArrayRef<T> v) : IValue() {
+  if (!v.has_value()) {
+    return;
+  }
+  *this = IValue(std::move(*v));
+}
+
+// std::array taken by value: starts from an empty List<T> and moves each of
+// the N elements into it in index order.
+template <class T, size_t N>
+inline IValue::IValue(std::array<T, N> v) : IValue(c10::List<T>()) {
+  auto elems = to<c10::List<T>>();
+  elems.reserve(N);
+  for (size_t idx = 0; idx < N; ++idx) {
+    elems.push_back(std::move(v[idx]));
+  }
+}
+
+// IListRef: reuses the boxed list when possible, otherwise copies the
+// elements into a fresh List<T>.
+template <class T, IValue::enable_if_ilist_is_ivalue_constructible<T>>
+inline IValue::IValue(c10::IListRef<T> v) : IValue() {
+  using boxed_t = typename c10::IListRef<T>::boxed_type;
+  constexpr bool boxed_type_constructs_ivalue =
+      std::is_constructible_v<IValue, boxed_t>;
+  // First, we try to use the boxed value.
+  if (boxed_type_constructs_ivalue && v.isBoxed()) {
+    *this = IValue(impl::toList(v.toBoxed()));
+    return;
+  }
+  // Either it's not in the boxed state, or its boxed type can not construct
+  // an IValue: fall back to copying the list.
+  c10::List<T> copied;
+  copied.reserve(v.size());
+  for (const auto& elem : v) {
+    copied.push_back(elem);
+  }
+  *this = IValue(impl::toList(std::move(copied)));
+}
+
+// Tags the value as GenericDict and stores the pointer released from the
+// dict's impl_ after passing it through null_to_undefined_tensor.
+inline IValue::IValue(c10::impl::GenericDict v)
+    : tag(Tag::GenericDict) {
+  auto* raw = v.impl_.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+// Typed dict: passed through impl::toGenericDict and on to the GenericDict
+// constructor above.
+template <class Key, class Value>
+inline IValue::IValue(c10::Dict<Key, Value> v)
+    : IValue(impl::toGenericDict(std::move(v))) {}
+
+// Builds a dict IValue from a std::unordered_map taken by value: starts from
+// an empty Dict<Key, Value> and inserts every entry of the map.
+template <class Key, class Value>
+inline IValue::IValue(std::unordered_map<Key, Value> v)
+    : IValue(Dict<Key, Value>()) {
+  auto dict = to<c10::Dict<Key, Value>>();
+  dict.reserve(v.size());
+  for (auto& e : v) {
+    // e.first is `const Key`, so std::move on it could only ever select the
+    // copy constructor; pass it as-is and move only the mapped value.
+    dict.insert(e.first, std::move(e.second));
+  }
+}
+
+// std::optional: stays default-constructed when empty, otherwise is assigned
+// from the moved-out contained value.
+template <class T, IValue::enable_if_ivalue_constructible<T>>
+inline IValue::IValue(std::optional<T> v) : IValue() {
+  if (!v.has_value()) {
+    return;
+  }
+  *this = IValue(std::move(*v));
+}
+
+// std::nullopt: identical to default construction.
+inline IValue::IValue(std::nullopt_t) : IValue() {}
+
+// Object pointer: tagged Object; the released raw pointer is stored after
+// passing through null_to_undefined_tensor.
+inline IValue::IValue(c10::intrusive_ptr<ivalue::Object> v)
+    : tag(Tag::Object) {
+  auto* raw = v.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+
+// PyObjectHolder pointer: tagged PyObject; stored the same way.
+inline IValue::IValue(c10::intrusive_ptr<ivalue::PyObjectHolder> v)
+    : tag(Tag::PyObject) {
+  auto* raw = v.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+
+// EnumHolder pointer: tagged Enum; stored the same way.
+inline IValue::IValue(c10::intrusive_ptr<ivalue::EnumHolder> v)
+    : tag(Tag::Enum) {
+  auto* raw = v.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+
+// Returns an IValue tagged Capsule whose payload is the pointer released
+// from `blob`, passed through null_to_undefined_tensor.
+inline IValue IValue::make_capsule(
+    intrusive_ptr<torch::CustomClassHolder> blob) {
+  IValue capsule;
+  capsule.tag = Tag::Capsule;
+  auto* raw = blob.release();
+  capsule.payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+  return capsule;
+}
+
+// Wraps a custom-class instance: looks up its registered class type, creates
+// a one-slot ivalue::Object of that type, stores the instance in slot 0 as a
+// capsule, and keeps the Object as this value's payload (tag Object).
+// A c10::Error from the type lookup is rethrown with a message naming T.
+template <
+    typename T,
+    std::enable_if_t<std::is_base_of_v<torch::CustomClassHolder, T>, int>>
+IValue::IValue(c10::intrusive_ptr<T> custom_class) : tag(Tag::Object) {
+  const auto lookup_class_type = []() {
+    try {
+      return c10::getCustomClassType<c10::intrusive_ptr<T>>();
+    } catch (const c10::Error&) {
+      throw c10::Error(
+          "Trying to instantiate a class that isn't a registered custom class: " +
+          std::string(c10::util::get_fully_qualified_type_name<T>()));
+    }
+  };
+  auto class_type = lookup_class_type();
+  auto holder_obj =
+      c10::ivalue::Object::create(std::move(class_type), /* numSlots */1);
+  holder_obj->setSlot(0, IValue::make_capsule(std::move(custom_class)));
+  auto* raw = holder_obj.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+
+// Future pointer: tagged Future; the released raw pointer is stored after
+// passing through null_to_undefined_tensor.
+inline IValue::IValue(c10::intrusive_ptr<ivalue::Future> v)
+    : tag(Tag::Future) {
+  auto* raw = v.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+
+// Await pointer: tagged Await; stored the same way.
+inline IValue::IValue(c10::intrusive_ptr<ivalue::Await> v)
+    : tag(Tag::Await) {
+  auto* raw = v.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+
+// RRefInterface pointer: tagged RRef; stored the same way.
+inline IValue::IValue(c10::intrusive_ptr<c10::RRefInterface> v)
+    : tag(Tag::RRef) {
+  auto* raw = v.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+
+// Quantizer pointer: tagged Quantizer; stored the same way.
+inline IValue::IValue(c10::intrusive_ptr<at::Quantizer> v)
+    : tag(Tag::Quantizer) {
+  auto* raw = v.release();
+  payload.u.as_intrusive_ptr = null_to_undefined_tensor(raw);
+}
+
+// Complex number: the tag is ComplexDouble for every T; the value is stored
+// in a freshly allocated ivalue::ComplexHolder whose pointer becomes the
+// payload.
+template <typename T>
+inline IValue::IValue(c10::complex<T> c)
+    : tag(Tag::ComplexDouble) {
+  payload.u.as_intrusive_ptr =
+      c10::make_intrusive<ivalue::ComplexHolder>(c).release();
+}
+
+// Returns a reference to the std::string held by the stored ConstantString.
+// Asserts the String tag; in debug builds also asserts that the stored
+// pointer is not the UndefinedTensorImpl singleton.
+inline const std::string& IValue::toStringRef() const {
+  AT_ASSERT(isString(), "Expected String but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toStringRef on null intrusive_ptr IValue");
+  const auto* holder = static_cast<const c10::ivalue::ConstantString*>(
+      payload.u.as_intrusive_ptr);
+  return holder->string();
+}
+// Like toStringRef, but a None value yields std::nullopt instead of failing
+// the String assertion. The result wraps a reference to the held string.
+inline std::optional<std::reference_wrapper<const std::string>> IValue::
+    toOptionalStringRef() const {
+  if (isNone()) {
+    return std::nullopt;
+  }
+  AT_ASSERT(isString(), "Expected std::optional<string> but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toOptionalStringRef on null intrusive_ptr IValue");
+  const auto* holder = static_cast<const c10::ivalue::ConstantString*>(
+      payload.u.as_intrusive_ptr);
+  return std::reference_wrapper<const std::string>(holder->string());
+}
+
+// Returns the string_view() of the stored ConstantString.
+// Asserts the String tag; in debug builds also asserts that the stored
+// pointer is not the UndefinedTensorImpl singleton.
+inline c10::string_view IValue::toStringView() const {
+  AT_ASSERT(isString(), "Expected String but got ", tagKind());
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+      "called toStringView on null intrusive_ptr IValue");
+  const auto* holder = static_cast<const c10::ivalue::ConstantString*>(
+      payload.u.as_intrusive_ptr);
+  return holder->string_view();
+}
+
+// Returns the raw PyObject* exposed by the holder from toPyObjectHolder().
+inline PyObject* IValue::toPyObject() const {
+  const auto holder = toPyObjectHolder();
+  return holder->getPyObject();
+}
+
+// Non-const overload: None maps to std::nullopt, anything else is converted
+// with to<T>().
+template <typename T>
+inline std::optional<T> IValue::toOptional() {
+  if (!this->isNone()) {
+    return this->to<T>();
+  }
+  return std::nullopt;
+}
+
+// Const overload with the same mapping.
+template <typename T>
+inline std::optional<T> IValue::toOptional() const {
+  if (!this->isNone()) {
+    return this->to<T>();
+  }
+  return std::nullopt;
+}
+
+// Forwards to torch::isCustomClass with this value; the actual check is
+// implemented there (not visible in this file section).
+inline bool IValue::isCustomClass() const {
+  return torch::isCustomClass(*this);
+}
+
+// Identity comparison between two IValues.
+//
+// memcmp on the payload is deliberately avoided because the union may
+// contain arbitrary padding bytes.
+//
+// Rules, checked in this order:
+//  - None vs None: same.
+//  - Bool, Int, Double, String pairs: compared by value.
+//  - Tensor vs Tensor: Tensor::is_same.
+//  - Tensor vs None (either order): same iff the tensor is undefined.
+//  - Anything else: same only when both sides are intrusive-pointer types
+//    holding the same pointer.
+inline bool IValue::isSameIdentity(const IValue& rhs) const {
+  if (this->isNone() && rhs.isNone()) {
+    return true;
+  }
+  if (this->isBool() && rhs.isBool()) {
+    // for bool type, do equality check
+    return this->toBool() == rhs.toBool();
+  }
+  if (this->isTensor() && rhs.isTensor()) {
+    return this->payload.as_tensor.is_same(rhs.payload.as_tensor);
+  }
+  if (this->isTensor() && rhs.isNone()) {
+    // special case: undefined tensor and None are the same identity
+    return !this->payload.as_tensor.defined();
+  }
+  if (this->isNone() && rhs.isTensor()) {
+    // special case: undefined tensor and None are the same identity
+    return !rhs.payload.as_tensor.defined();
+  }
+  if (this->isInt() && rhs.isInt()) {
+    return this->toInt() == rhs.toInt();
+  }
+  if (this->isDouble() && rhs.isDouble()) {
+    return this->toDouble() == rhs.toDouble();
+  }
+  if (this->isString() && rhs.isString()) {
+    return this->toStringRef() == rhs.toStringRef();
+  }
+  // for objects holding in IValue, do shallow compare on pointer address to
+  // testify the identity
+  return this->isIntrusivePtr() && rhs.isIntrusivePtr() &&
+      this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr;
+}
+
+namespace ivalue {
+namespace detail {
+
+// Selected when IValue is constructible from T: forwards the argument to the
+// matching IValue constructor.
+template <typename T>
+IValue from_(T&& value, std::true_type) {
+  return IValue(std::forward<T>(value));
+}
+// Selected for intrusive_ptr arguments when IValue is not reported as
+// constructible from the argument type.
+template <typename T>
+IValue from_(c10::intrusive_ptr<T> ptr, std::false_type) {
+  return IValue(std::move(ptr));
+}
+// Catch-all for every other unsupported type: fails at compile time.
+template <typename T>
+IValue from_(T&& /*x*/, std::false_type) {
+  static_assert(
+      guts::false_t<T>::value,
+      "You are calling from with a type that it doesn't support, and isn't a potential custom class (ie: is an intrusive_ptr)");
+  return IValue();
+}
+} // namespace detail
+
+// Converts `x` to an IValue, dispatching on whether IValue is constructible
+// from T.
+template <typename T>
+IValue from(T&& x) {
+  using constructs_ivalue =
+      std::bool_constant<std::is_constructible_v<IValue, T>>;
+  return detail::from_(std::forward<T>(x), constructs_ivalue{});
+}
+
+} // namespace ivalue
+
+
+// MaybeOwned support for IValue. Owned and borrowed representations are both
+// IValue.
+template <>
+struct MaybeOwnedTraits<IValue> {
+  using owned_type = IValue;
+  using borrow_type = IValue;
+
+  // Non-pointer values are copied. Tensors go through
+  // MaybeOwnedTraits<at::Tensor>::createBorrow. Every other pointer type is
+  // rebuilt directly from the source's payload and tag.
+  static borrow_type createBorrow(const owned_type& from) {
+    if (!from.isPtrType()) {
+      return from;
+    }
+    return from.isTensor()
+        ? IValue(MaybeOwnedTraits<at::Tensor>::createBorrow(from.toTensor()))
+        : IValue(from.payload, from.tag);
+  }
+
+  // Resets lhs to None first, then fills it using the same three cases as
+  // createBorrow.
+  static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) {
+    lhs.clearToNone();
+    if (!rhs.isPtrType()) {
+      lhs = rhs;
+      return;
+    }
+    if (rhs.isTensor()) {
+      lhs = IValue(MaybeOwnedTraits<at::Tensor>::createBorrow(rhs.toTensor()));
+      return;
+    }
+    lhs = IValue(rhs.payload, rhs.tag);
+  }
+
+  // A borrow is dropped by resetting it to None.
+  static void destroyBorrow(borrow_type& toDestroy) {
+    toDestroy.clearToNone();
+  }
+
+  // The borrow is itself an IValue, so it is returned as-is.
+  static const owned_type& referenceFromBorrow(const borrow_type& borrow) {
+    return borrow;
+  }
+
+  // Address of the borrow itself.
+  static const owned_type* pointerFromBorrow(const borrow_type& borrow) {
+    return &borrow;
+  }
+
+  // No validity check is performed; always reports true.
+  static bool debugBorrowIsValid(const borrow_type&) {
+    return true;
+  }
+};
+
+// TagType specialization for c10::Type. `get` is only declared here
+// (TORCH_API); its definition is not part of this header section.
+template <>
+struct IValue::TagType<c10::Type> {
+  static TORCH_API c10::TypePtr get(const IValue&);
+};
+
+// TagType specialization for c10::DynamicType. `get` is likewise only
+// declared here.
+template <>
+struct IValue::TagType<c10::DynamicType> {
+  static TORCH_API c10::TypePtr get(const IValue&);
+};
+
+// Returns the type of this value by delegating to TagType<T>::get, so the
+// result depends on which TagType specialization T selects.
+template <typename T>
+TypePtr IValue::type() const {
+  return IValue::TagType<T>::get(*this);
+}
+
+} // namespace c10
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/rref_interface.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/rref_interface.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0749d368792f0e8b90710c614f802f59909a43f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/rref_interface.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <c10/util/intrusive_ptr.h>
+#include <ATen/core/jit_type_base.h>
+
+namespace c10 {
+
+struct Type;
+using worker_id_t = int16_t;
+
+// This abstract class contains only user-facing APIs, and will be shared
+// between jit and distributed to implement TorchScript support.
+class C10_EXPORT RRefInterface : public c10::intrusive_ptr_target {
+ public:
+  RRefInterface() = default;
+  // RRef is made NOT copyable NOT movable to prevent messing up reference
+  // counting. All four copy/move operations are deleted explicitly; copy
+  // assignment was previously deleted only implicitly, as a consequence of
+  // the user-declared move operations.
+  RRefInterface(const RRefInterface& other) = delete;
+  RRefInterface(RRefInterface&& other) = delete;
+  RRefInterface& operator=(const RRefInterface& other) = delete;
+  RRefInterface& operator=(RRefInterface&& other) = delete;
+
+  ~RRefInterface() override = default;
+
+  // returns the worker id of the owner
+  virtual worker_id_t owner() const = 0;
+
+  // returns the worker name of the owner
+  virtual std::string ownerName() const = 0;
+
+  // Returns true if this is the ``OwnerRRef``
+  virtual bool isOwner() const = 0;
+
+  // Returns true if this is an ``OwnerRRef`` or if this ``UserRRef`` has been
+  // confirmed by its owner.
+  virtual bool confirmedByOwner() const = 0;
+
+  // Returns the type associated with this RRef; the exact meaning is defined
+  // by the implementing subclass.
+  virtual const TypePtr type() const = 0;
+};
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/stack.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/stack.h
new file mode 100644
index 0000000000000000000000000000000000000000..6372a3ccb556fe3b2285eb267b94e28bb6e2ea48
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/stack.h
@@ -0,0 +1,204 @@
+#pragma once
+
+#include <type_traits>
+
+#include <ATen/core/ivalue.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/irange.h>
+
+// TODO move this to c10 namespace
+
+
+namespace torch::jit {
+
+using c10::IValue;
+using Stack = std::vector<IValue>;
+
+// Callable wrapper around a std::function<void(Stack&)>.
+class Operation {
+  // True when a std::function<void(Arg)> can be constructed from F&&.
+  template <typename F, typename Arg>
+  using accepts = std::is_constructible<std::function<void(Arg)>, F&&>;
+
+ public:
+  // Deprecated: accepts a callable taking Stack* and adapts it by passing
+  // the address of the stack.
+  template <typename F,
+            std::enable_if_t<accepts<F, Stack*>::value, int> = 0>
+  C10_DEPRECATED_MESSAGE("Please use void(Stack&) to register operator instead.")
+  // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward)
+  Operation(F&& raw): op_([raw = std::forward<F>(raw)](Stack& stack) {
+    raw(&stack);
+  }) {}
+
+  // Accepts any callable taking Stack&, except Operation itself, so that
+  // copy/move construction is not hijacked by this template.
+  template <typename F,
+            std::enable_if_t<accepts<F, Stack&>::value &&
+                !std::is_same_v<std::decay_t<F>, Operation>, int> = 0>
+  Operation(F&& op): op_(std::forward<F>(op)) {}
+
+  // Creates an empty Operation (op_ holds no target).
+  Operation(std::nullptr_t) noexcept {}
+
+  // True when a target callable is stored.
+  explicit operator bool() const noexcept {
+    return static_cast<bool>(op_);
+  }
+
+  // Invokes the stored callable on `stack`.
+  void operator()(Stack& stack) {
+    op_(stack);
+  }
+
+  // Exposes std::function::target<T>() of the stored callable.
+  template <typename T>
+  T* target() noexcept {
+    return op_.target<T>();
+  }
+
+ private:
+  std::function<void(Stack&)> op_;
+};
+
+// An operation with N inputs and M outputs pops the last N inputs off
+// the stack and pushes its M outputs onto the stack
+// before: <other stack items> I0, I1, ... IN <- stack.back()
+// after: <other stack items> O0, O1, ... OM
+// Operations are defined this way so that ownership of inputs can be
+// transferred to the operation and it can incrementally drop ownership of
+// tensors when they become unneeded. For large operations, like 'run an entire
+// subgraph', this functionality is very important for minimizing gpu memory
+// usage. The return value is the relative 'offset' to jump to for the next
+// operation:
+//   pc += 1 + offset
+// so a return value of 0 goes to the next instruction
+
+// treat the last N elements of the stack as a list, looking up
+// element i
+// Mutable access to element i of the window formed by the last N stack
+// entries. No bounds checking is performed.
+inline IValue& peek(Stack& stack, size_t i, size_t N) {
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions)
+  const auto window_begin = stack.end() - N;
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions)
+  return *(window_begin + i);
+}
+// Pointer convenience overload.
+inline IValue& peek(Stack* stack, size_t i, size_t N) {
+  Stack& ref = *stack;
+  return peek(ref, i, N);
+}
+// Read-only access to element i of the window formed by the last N stack
+// entries. No bounds checking is performed.
+inline const IValue& peek(const Stack& stack, size_t i, size_t N) {
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions)
+  const auto window_begin = stack.end() - N;
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions)
+  return *(window_begin + i);
+}
+// Pointer convenience overload.
+inline const IValue& peek(const Stack* stack, size_t i, size_t N) {
+  const Stack& ref = *stack;
+  return peek(ref, i, N);
+}
+// treat the last N elements of the stack as a list, looking up the
+// slice starting at index i and having length len
+// View of `len` elements starting at index i within the window formed by the
+// last N stack entries.
+inline at::ArrayRef<IValue> peekSlice(
+    const Stack& stack,
+    size_t i,
+    size_t len,
+    size_t N) {
+  const size_t start = stack.size() - N + i;
+  return at::ArrayRef<IValue>(stack).slice(start, len);
+}
+// View of the last N stack entries.
+inline at::ArrayRef<IValue> last(const Stack& stack, size_t N) {
+  const size_t start = stack.size() - N;
+  return at::ArrayRef<IValue>(stack).slice(start, N);
+}
+// Pointer convenience overload.
+inline at::ArrayRef<IValue> last(const Stack* stack, size_t N) {
+  const Stack& ref = *stack;
+  return last(ref, N);
+}
+// Erases the last n entries of the stack. No bounds checking is performed.
+inline void drop(Stack& stack, size_t n) {
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions)
+  const auto tail_begin = stack.end() - n;
+  stack.erase(tail_begin, stack.end());
+}
+// Pointer convenience overload.
+inline void drop(Stack* stack, size_t n) {
+  Stack& ref = *stack;
+  drop(ref, n);
+}
+// Moves the top entry out of the stack, removes it, and returns it.
+inline IValue pop(Stack& stack) {
+  IValue top = std::move(stack.back());
+  stack.pop_back();
+  return top;
+}
+// Pointer convenience overload.
+inline IValue pop(Stack* stack) {
+  Stack& ref = *stack;
+  return pop(ref);
+}
+// Moves the last n entries into a new vector, preserving their stack order,
+// then removes them from the stack.
+inline std::vector<IValue> pop(Stack& stack, size_t n) {
+  std::vector<IValue> popped;
+  popped.reserve(n);
+  for (size_t idx = 0; idx < n; ++idx) {
+    popped.push_back(std::move(peek(stack, idx, n)));
+  }
+  drop(stack, n);
+  return popped;
+}
+
+// variadic pop:
+// int64_t a; at::Tensor b;
+// pop(stack, a, b);
+// equivalent to:
+// b = pop(stack).toTensor();
+// a = pop(stack).toInt();
+// Variadic pop: assigns the last sizeof...(args) stack entries to `args` in
+// order (first argument receives the deepest of those entries), converting
+// each with to<Type>(), then drops them from the stack.
+template <typename... Types>
+inline void pop(Stack& stack, Types&... args) {
+  constexpr size_t N = sizeof...(args);
+  size_t i = 0;
+  // The comma fold is evaluated left to right, so `i` advances in argument
+  // order.
+  ((void)(args = std::move(peek(stack, i++, N)).template to<Types>()), ...);
+  drop(stack, N);
+}
+// Pointer convenience overload.
+template <typename... Types>
+inline void pop(Stack* stack, Types&... args) {
+  Stack& ref = *stack;
+  pop(ref, args...);
+}
+// Appends one entry, constructed in place from the forwarded argument.
+template <typename Type>
+inline void push_one(Stack& stack, Type&& arg) {
+  stack.emplace_back(std::forward<Type>(arg));
+}
+
+// TensorOptions is expanded into four consecutive entries, in this order:
+// scalar type (from dtype), layout, device, pinned_memory.
+inline void push_one(Stack& stack, c10::TensorOptions options) {
+  const auto scalar_type = c10::typeMetaToScalarType(options.dtype());
+  stack.emplace_back(scalar_type);
+  const auto layout = options.layout();
+  stack.emplace_back(layout);
+  const auto device = options.device();
+  stack.emplace_back(device);
+  const auto pinned = options.pinned_memory();
+  stack.emplace_back(pinned);
+}
+
+// Pushes every argument via push_one, in argument order.
+template <typename... Types>
+inline void push(Stack& stack, Types&&... args) {
+  // Comma fold: evaluated left to right.
+  (push_one(stack, std::forward<Types>(args)), ...);
+}
+// Pointer convenience overload.
+template <typename... Types>
+inline void push(Stack* stack, Types&&... args) {
+  Stack& ref = *stack;
+  return push(ref, std::forward<Types>(args)...);
+}
+// Appends each element of `elements` to the stack as its own entry, in list
+// order. Every element is first copied out of the list as a T.
+template <class T>
+inline void push_list_elements(Stack& stack, const c10::List<T>& elements) {
+  for (T item : elements) {
+    stack.push_back(std::move(item));
+  }
+}
+
+// The packer here is carefully written not to make any unnecessary
+// copies.
+
+// pack takes the return values of aten functions and pushes them onto the stack
+// Appends a single value, constructed in place from the perfectly forwarded
+// argument.
+template <typename T>
+inline void pack(Stack& stack, T&& v) {
+  stack.emplace_back(std::forward<T>(v));
+}
+// Pointer convenience overload; forwards to the reference version.
+template <typename T>
+inline void pack(Stack* stack, T&& v) {
+  pack(*stack, std::forward<T>(v));
+}
+
+// Recursive helper: packs the tuple element at index
+// sizeof...(Args) - remaining, then recurses with remaining - 1.
+template <std::size_t remaining, typename... Args>
+struct TuplePacker {
+  // NB: *Not* a universal reference.
+  static void execute(Stack& stack, std::tuple<Args...>&& t) {
+    constexpr std::size_t index = sizeof...(Args) - remaining;
+    // NB: The move here does not "destroy" the entire tuple, that is
+    // not what std::move does; only the particular tuple index
+    // processed here gets stolen.
+    pack(stack, std::get<index>(std::move(t)));
+    using Next = TuplePacker<remaining - 1, Args...>;
+    Next::execute(stack, std::move(t));
+  }
+};
+
+// Recursion terminator: with zero elements remaining there is nothing left
+// to pack.
+template <typename... Args>
+struct TuplePacker<0, Args...> {
+  // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved)
+  static void execute(Stack& /*stack*/, std::tuple<Args...>&& /*t*/) {}
+};
+
+// Tuple overload: pushes each tuple element as its own stack entry, in index
+// order, by running TuplePacker over all sizeof...(Args) elements.
+template <typename... Args>
+inline void pack(Stack& stack, std::tuple<Args...>&& t) {
+  constexpr std::size_t arity = sizeof...(Args);
+  TuplePacker<arity, Args...>::execute(stack, std::move(t));
+}
+
+} // namespace torch::jit
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/core/type_factory.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/type_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..8592a8864d64e4db14a5eefda85a9e4ad02dc27c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/core/type_factory.h
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <type_traits>
+#include <unordered_map>
+
+#include <ATen/core/dynamic_type.h>
+#include <ATen/core/jit_type_base.h>
+#include <c10/macros/Macros.h>
+
+namespace c10 {
+
+// Primary template is intentionally empty; the factory functions live in the
+// explicit specializations for c10::DynamicType and c10::Type.
+template <typename T>
+struct TORCH_API TypeFactoryBase {};
+
+// Factory that produces c10::DynamicType instances. The tag for a given T is
+// taken from c10::DynamicTypeTrait<T>::tagValue().
+template <>
+struct TORCH_API TypeFactoryBase<c10::DynamicType> {
+  // Creates a DynamicType tagged for T whose arguments are `ty` followed by
+  // `args...`, in that order.
+  template <typename T, typename... Args>
+  static c10::DynamicTypePtr create(TypePtr ty, Args&&... args) {
+    return std::make_shared<c10::DynamicType>(
+        c10::DynamicTypeTrait<T>::tagValue(),
+        c10::DynamicType::Arguments(c10::ArrayRef<c10::TypePtr>(
+            {std::move(ty), std::forward<Args>(args)...})));
+  }
+  // Creates a DynamicType tagged for T from a vector of argument types.
+  template <typename T>
+  static c10::DynamicTypePtr create(const std::vector<c10::TypePtr>& types) {
+    return std::make_shared<c10::DynamicType>(
+        c10::DynamicTypeTrait<T>::tagValue(),
+        c10::DynamicType::Arguments(types));
+  }
+  // Creates a named DynamicType with the Tuple tag from field names and
+  // their types.
+  static c10::DynamicTypePtr createNamedTuple(
+      const std::string& name,
+      const std::vector<c10::string_view>& fields,
+      const std::vector<c10::TypePtr>& types) {
+    return std::make_shared<c10::DynamicType>(
+        c10::DynamicType::Tag::Tuple,
+        name,
+        c10::DynamicType::Arguments(fields, types));
+  }
+  // Creates a named DynamicType tagged for T with an empty argument list.
+  template <typename T>
+  C10_ERASE static c10::DynamicTypePtr createNamed(const std::string& name) {
+    return std::make_shared<c10::DynamicType>(
+        c10::DynamicTypeTrait<T>::tagValue(),
+        name,
+        c10::DynamicType::Arguments{});
+  }
+  // Returns DynamicTypeTrait<T>::getBaseType().
+  template <typename T>
+  C10_ERASE static c10::DynamicTypePtr get() {
+    return DynamicTypeTrait<T>::getBaseType();
+  }
+  // Declared only; maps type-name strings to TypePtr (definition not visible
+  // here).
+  static const std::unordered_map<std::string, c10::TypePtr>& basePythonTypes();
+};
+
+using DynamicTypeFactory = TypeFactoryBase<c10::DynamicType>;
+
+// Helper functions for constructing DynamicTypes inline.
+// Overload enabled when DynamicTypeTrait<T>::isBaseType is true: takes no
+// arguments and returns DynamicTypeFactory::get<T>().
+template <
+    typename T,
+    std::enable_if_t<DynamicTypeTrait<T>::isBaseType, int> = 0>
+C10_ERASE DynamicTypePtr dynT() {
+  return DynamicTypeFactory::get<T>();
+}
+
+// Overload enabled when DynamicTypeTrait<T>::isBaseType is false: forwards
+// its arguments to DynamicTypeFactory::create<T>.
+template <
+    typename T,
+    typename... Args,
+    std::enable_if_t<!DynamicTypeTrait<T>::isBaseType, int> = 0>
+C10_ERASE DynamicTypePtr dynT(Args&&... args) {
+  return DynamicTypeFactory::create<T>(std::forward<Args>(args)...);
+}
+
+// Factory that produces c10::Type instances by delegating to the static
+// create/get functions of the concrete type T.
+template <>
+struct TORCH_API TypeFactoryBase<c10::Type> {
+  // Forwards `ty` and `args...` to T::create.
+  template <typename T, typename... Args>
+  static c10::TypePtr create(TypePtr ty, Args&&... args) {
+    return T::create(std::move(ty), std::forward<Args>(args)...);
+  }
+  // Moves the vector of types into T::create.
+  template <typename T>
+  static c10::TypePtr create(std::vector<c10::TypePtr> types) {
+    return T::create(std::move(types));
+  }
+  // Declared only; the definition is not part of this header.
+  static c10::TypePtr createNamedTuple(
+      const std::string& name,
+      const std::vector<c10::string_view>& fields,
+      const std::vector<c10::TypePtr>& types);
+  // Forwards `name` to T::create.
+  template <typename T>
+  C10_ERASE static c10::TypePtr createNamed(const std::string& name) {
+    return T::create(name);
+  }
+  // Declared only; maps type-name strings to TypePtr (definition not visible
+  // here).
+  static const std::unordered_map<std::string, c10::TypePtr>& basePythonTypes();
+  // Returns T::get().
+  template <typename T>
+  C10_ERASE static c10::TypePtr get() {
+    return T::get();
+  }
+};
+
+using DefaultTypeFactory = TypeFactoryBase<c10::Type>;
+
+// PlatformType is c10::DynamicType when C10_MOBILE is defined and c10::Type
+// otherwise.
+using PlatformType =
+#ifdef C10_MOBILE
+    c10::DynamicType
+#else
+    c10::Type
+#endif
+    ;
+
+// Factory matching the platform's type representation.
+using TypeFactory = TypeFactoryBase<PlatformType>;
+
+} // namespace c10
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_batch_norm_with_update_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_batch_norm_with_update_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..613fed3c16e921d07c08c974b21bc14fa6e1a668
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_batch_norm_with_update_native.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _batch_norm_with_update_functional(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, const at::Tensor & running_mean, const at::Tensor & running_var, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _batch_norm_with_update_cpu_out(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, at::Tensor & reserve);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _batch_norm_with_update_cpu(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _batch_norm_with_update_cuda_out(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, at::Tensor & reserve);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _batch_norm_with_update_cuda(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _batch_norm_with_update_mkldnn(const at::Tensor & input, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps);
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_coalesced_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_coalesced_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..3538a667af6b5e324115d09aea940e06284e75bb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_coalesced_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // Declarations only (no definitions in this header) for the _coalesced native functions.
+TORCH_API at::Tensor _coalesced(const at::Tensor & self, bool coalesced); // returns an at::Tensor by value
+TORCH_API at::Tensor & _coalesced_out(const at::Tensor & self, bool coalesced, at::Tensor & out); // out-variant: takes a mutable `out`, returns a Tensor reference
+TORCH_API at::Tensor & _coalesced_sparse_(at::Tensor & self, bool coalesced); // takes a mutable `self`; presumably in-place on sparse tensors (per the name) -- TODO confirm
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_cpu_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7919cbfb5047e478ed1b13e9b53e98ba2f09e98
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_cpu_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations in at::cpu; the file header notes that the implementing C++ file is RegisterDispatchKey.cpp.
+namespace cpu {
+// Takes a tensor plus an int64_t `innerKTiles` and returns an at::Tensor by value; no definition lives in this header.
+TORCH_API at::Tensor _convert_weight_to_int4pack(const at::Tensor & self, int64_t innerKTiles);
+
+} // namespace cpu
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cudnn_rnn_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cudnn_rnn_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1267b7894c52ef4de0d0f7751802854f57cdd5d5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cudnn_rnn_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Four declarations of the _cudnn_rnn out-variant; each returns a tuple of five at::Tensor references.
+namespace compositeexplicitautograd {
+// `_out` takes out0..out4 first, `_outf` takes them last; `_symint_` variants use c10::SymInt for hidden_size/proj_size and c10::SymIntArrayRef for batch_sizes.
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _cudnn_rnn_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional<at::Tensor> & weight_buf, const at::Tensor & hx, const ::std::optional<at::Tensor> & cx, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const ::std::optional<at::Tensor> & dropout_state);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _cudnn_rnn_outf(const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional<at::Tensor> & weight_buf, const at::Tensor & hx, const ::std::optional<at::Tensor> & cx, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const ::std::optional<at::Tensor> & dropout_state, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _cudnn_rnn_symint_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional<at::Tensor> & weight_buf, const at::Tensor & hx, const ::std::optional<at::Tensor> & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const ::std::optional<at::Tensor> & dropout_state);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _cudnn_rnn_symint_outf(const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional<at::Tensor> & weight_buf, const at::Tensor & hx, const ::std::optional<at::Tensor> & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const ::std::optional<at::Tensor> & dropout_state, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_dimI.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_dimI.h
new file mode 100644
index 0000000000000000000000000000000000000000..debf7569d95bf4b9664c6fe1c65d65f08bb4230b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_dimI.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/_dimI_ops.h>
+
+namespace at {
+// No free-function wrapper is emitted in this generated header for _dimI; the namespace body is empty.
+// NOTE(review): the operator declarations presumably come from the included <ATen/ops/_dimI_ops.h> -- confirm there.
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bbf3531834c870b8d7558ba42a18ec5f029547a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Two declarations of the out-variant of _embedding_bag_per_sample_weights_backward; both return an at::Tensor reference.
+namespace compositeexplicitautograd {
+// `_out` takes `out` first and defaults padding_idx to -1; `_outf` takes `out` last and has no default for padding_idx.
+TORCH_API at::Tensor & _embedding_bag_per_sample_weights_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1);
+TORCH_API at::Tensor & _embedding_bag_per_sample_weights_backward_outf(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..efddcc5990bed08e2c0f40525cdf374219ef6f13
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward.h
@@ -0,0 +1,47 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/_embedding_bag_sparse_backward_ops.h>
+
+namespace at {
+// Free-function wrappers for aten::_embedding_bag_sparse_backward; every overload below forwards its arguments unchanged to at::_ops::_embedding_bag_sparse_backward::call.
+// The symint:: templates do not use T in their parameter lists, so T must be given explicitly; enable_if admits only int64_t (first) or c10::SymInt (second).
+// aten::_embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+inline at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_sparse_backward::call(grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_sparse_backward::call(grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);
+  }
+}
+// NOTE(review): the int64_t overloads above pass num_weights where the schema says SymInt -- presumably an implicit conversion; confirm against the _ops.h signature.
+// aten::_embedding_bag_sparse_backward(Tensor grad, Tensor indices, Tensor offsets, Tensor offset2bag, Tensor bag_size, SymInt num_weights, bool scale_grad_by_freq, int mode, Tensor? per_sample_weights, int padding_idx=-1) -> Tensor
+inline at::Tensor _embedding_bag_sparse_backward_symint(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_sparse_backward::call(grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1) {
+    return at::_ops::_embedding_bag_sparse_backward::call(grad, indices, offsets, offset2bag, bag_size, num_weights, scale_grad_by_freq, mode, per_sample_weights, padding_idx);
+  }
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_max.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_max.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc683dc29966b3c0f8178c5f8d8a4a0070a8ffbd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_max.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/_foreach_max_ops.h>
+
+namespace at {
+// Free-function wrappers for aten::_foreach_max; each one is a single forwarding call into at::_ops.
+// Note the argument order: _foreach_max_out takes (out, self) but calls the op with (self, out); _foreach_max_outf takes and passes (self, out).
+// aten::_foreach_max(Tensor[] self) -> Tensor[]
+inline ::std::vector<at::Tensor> _foreach_max(at::TensorList self) {
+    return at::_ops::_foreach_max::call(self);
+}
+// Both out-style wrappers below return void and dispatch to the same operator struct, at::_ops::_foreach_max_out.
+// aten::_foreach_max.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_max_out(at::TensorList out, at::TensorList self) {
+    return at::_ops::_foreach_max_out::call(self, out);
+}
+// aten::_foreach_max.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_max_outf(at::TensorList self, at::TensorList out) {
+    return at::_ops::_foreach_max_out::call(self, out);
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_round_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_round_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd51138e08a6a8928ee316d8e064fe8d506309ff
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_round_cuda_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations in at::cuda; the file header notes that the implementing C++ file is RegisterDispatchKey.cpp.
+namespace cuda {
+// _foreach_round returns a vector of tensors; _foreach_round_ returns void (presumably mutates `self` in place, per the trailing underscore -- TODO confirm).
+TORCH_API ::std::vector<at::Tensor> _foreach_round(at::TensorList self);
+TORCH_API void _foreach_round_(at::TensorList self);
+
+} // namespace cuda
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lu_with_info.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lu_with_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6e15dca7daffe6b74d5418913dbc6c0ba04b1c5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lu_with_info.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/_lu_with_info_ops.h>
+
+namespace at {
+// Free-function wrapper for aten::_lu_with_info: forwards (self, pivot, check_errors) unchanged to at::_ops::_lu_with_info::call.
+// Both bool parameters default to true; the result is a tuple of three tensors, named (LU, pivots, info) in the schema below.
+// aten::_lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor LU, Tensor pivots, Tensor info)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _lu_with_info(const at::Tensor & self, bool pivot=true, bool check_errors=true) {
+    return at::_ops::_lu_with_info::call(self, pivot, check_errors);
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_scale_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_scale_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6370180e6de0d79865cd7782f387e710164a82a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_scale_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Two declarations of the out-variant of _masked_scale; both return an at::Tensor reference and are not defined in this header.
+namespace compositeexplicitautograd {
+// `_out` takes the output tensor first; `_outf` takes the same arguments with the output tensor last.
+TORCH_API at::Tensor & _masked_scale_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, double scale);
+TORCH_API at::Tensor & _masked_scale_outf(const at::Tensor & self, const at::Tensor & mask, double scale, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..a21798c8b6852fb83af1764d4d4c476c6cb1a88a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // Declarations only (no definitions in this header) for the _masked_softmax native functions.
+TORCH_API at::Tensor & _masked_softmax_out(const at::Tensor & self, const at::Tensor & mask, ::std::optional<int64_t> dim, ::std::optional<int64_t> mask_type, at::Tensor & out); // out-variant: no defaults for dim/mask_type
+TORCH_API at::Tensor masked_softmax_cpu(const at::Tensor & self, const at::Tensor & mask, ::std::optional<int64_t> dim=::std::nullopt, ::std::optional<int64_t> mask_type=::std::nullopt); // dim and mask_type default to nullopt
+TORCH_API at::Tensor masked_softmax_cuda(const at::Tensor & self, const at::Tensor & mask, ::std::optional<int64_t> dim=::std::nullopt, ::std::optional<int64_t> mask_type=::std::nullopt); // same signature as the _cpu declaration
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_from_padded_and_nested_example_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_from_padded_and_nested_example_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0510927a120e8a6353faf3fa05542c42510e6893
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_from_padded_and_nested_example_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Two declarations of the out-variant of _nested_from_padded_and_nested_example; both return an at::Tensor reference.
+namespace compositeexplicitautograd {
+// `_out` takes the output tensor first; `_outf` takes (padded, nt_example) first and the output tensor last.
+TORCH_API at::Tensor & _nested_from_padded_and_nested_example_out(at::Tensor & out, const at::Tensor & padded, const at::Tensor & nt_example);
+TORCH_API at::Tensor & _nested_from_padded_and_nested_example_outf(const at::Tensor & padded, const at::Tensor & nt_example, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_get_jagged_dummy_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_get_jagged_dummy_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..03961d4b89644a45fb66c90c752d1dca72443cec
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_get_jagged_dummy_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+// Operator descriptor for aten::_nested_get_jagged_dummy: carries the C++ function type (`schema`), the name/overload/schema strings,
+// and declarations of the static `call` and `redispatch` entry points (declared here, not defined in this header).
+struct TORCH_API _nested_get_jagged_dummy {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_nested_get_jagged_dummy")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_nested_get_jagged_dummy(Tensor any) -> Tensor")
+  static at::Tensor call(const at::Tensor & any);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & any);
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_pack_padded_sequence_backward_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_pack_padded_sequence_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4de1e7beaa5c6a4b8a0f0281cffc6b6e1db4757
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_pack_padded_sequence_backward_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // Declaration only (no definition in this header) for the _pack_padded_sequence_backward native function.
+TORCH_API at::Tensor _pack_padded_sequence_backward_symint(const at::Tensor & grad, c10::SymIntArrayRef input_size, const at::Tensor & batch_sizes, bool batch_first); // input_size is a c10::SymIntArrayRef; returns an at::Tensor by value
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..1809f2b804e00a901feca794959bc05304ff3a8f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+// Operator descriptors for aten::_sparse_coo_tensor_with_dims. Both structs share the same `name` and differ by `overload_name` ("" vs "out").
+// Each carries the C++ function type (`schema`), the schema string, and declarations of static `call`/`redispatch` (not defined in this header).
+struct TORCH_API _sparse_coo_tensor_with_dims {
+  using schema = at::Tensor (int64_t, int64_t, at::IntArrayRef, ::std::optional<at::ScalarType>, ::std::optional<at::Layout>, ::std::optional<at::Device>, ::std::optional<bool>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_coo_tensor_with_dims")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor")
+  static at::Tensor call(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+};
+// "out" overload: replaces the dtype/layout/device/pin_memory arguments with a mutable `out` tensor and returns an at::Tensor reference.
+struct TORCH_API _sparse_coo_tensor_with_dims_out {
+  using schema = at::Tensor & (int64_t, int64_t, at::IntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_coo_tensor_with_dims")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_coo_tensor_with_dims.out(int sparse_dim, int dense_dim, int[] size, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_semi_structured_mm.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_semi_structured_mm.h
new file mode 100644
index 0000000000000000000000000000000000000000..a454c6820905514c51e6714922e96268d7cea10b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_semi_structured_mm.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/_sparse_semi_structured_mm_ops.h>
+
+namespace at {
+// Free-function wrapper for aten::_sparse_semi_structured_mm: forwards all four arguments unchanged to at::_ops::_sparse_semi_structured_mm::call.
+// `out_dtype` is optional and defaults to ::std::nullopt (None in the schema below).
+// aten::_sparse_semi_structured_mm(Tensor mat1, Tensor mat1_meta, Tensor mat2, *, ScalarType? out_dtype=None) -> Tensor
+inline at::Tensor _sparse_semi_structured_mm(const at::Tensor & mat1, const at::Tensor & mat1_meta, const at::Tensor & mat2, ::std::optional<at::ScalarType> out_dtype=::std::nullopt) {
+    return at::_ops::_sparse_semi_structured_mm::call(mat1, mat1_meta, mat2, out_dtype);
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_standard_gamma_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_standard_gamma_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c370452cc24fb90efae88cdae979f000efe40760
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_standard_gamma_cuda_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations in at::cuda; the file header notes that the implementing C++ file is RegisterDispatchKey.cpp.
+namespace cuda {
+// Takes a tensor and an optional at::Generator (default ::std::nullopt) and returns an at::Tensor by value.
+TORCH_API at::Tensor _standard_gamma(const at::Tensor & self, ::std::optional<at::Generator> generator=::std::nullopt);
+
+} // namespace cuda
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_check_tensor_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_check_tensor_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4562fd2462f6ba75b9840ba7b31999182b79967
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_check_tensor_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+// Operator descriptor for aten::_test_check_tensor: carries the C++ function type (`schema`), the name/overload/schema strings,
+// and declarations of the static `call` and `redispatch` entry points (declared here, not defined in this header).
+struct TORCH_API _test_check_tensor {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_check_tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_check_tensor(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_compositeimplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..dfc04c0255489d59b146d6824a65b449d4a3339f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_bsr_tensor_args_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declaration in at::compositeimplicitautograd; the file header notes that the implementing C++ file is RegisterDispatchKey.cpp.
+namespace compositeimplicitautograd {
+// Takes three tensors (crow_indices, col_indices, values) and a size array, and returns void; how failures are reported is not visible from this header.
+TORCH_API void _validate_sparse_bsr_tensor_args(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_values_copy_compositeexplicitautogradnonfunctional_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_values_copy_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa460e7de9ee8159633f4e10091b464b36db561d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_values_copy_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor _values_copy(const at::Tensor & self);  // Declaration only (no body in this header): takes self by const reference and returns an at::Tensor by value.
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/add_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/add_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..e19bc4649a705a29bd9aed8c728a384221b05735
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/add_ops.h
@@ -0,0 +1,83 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API add_Tensor {  // Groups the signature type, name strings and call/redispatch entry points for the "add.Tensor" overload; declarations only.
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::add")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha);  // Declared here, defined elsewhere; returns the result by value.
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+struct TORCH_API add__Tensor {  // Groups the signature type, name strings and call/redispatch entry points for the "add_.Tensor" overload; declarations only.
+  using schema = at::Tensor & (at::Tensor &, const at::Tensor &, const at::Scalar &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::add_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha);  // self is a non-const reference and the return is at::Tensor &; presumably the returned reference is self (per the (a!) annotation) - confirm.
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+struct TORCH_API add_out {  // Groups the signature type, name strings and call/redispatch entry points for the "add.out" overload; declarations only.
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Scalar &, at::Tensor &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::add")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out);  // out is the last parameter and is a non-const reference; the return is at::Tensor &.
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+struct TORCH_API add_Scalar {  // Groups the signature type, name strings and call/redispatch entry points for the "add.Scalar" overload; declarations only.
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &, const at::Scalar &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::add")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha);  // Here other is an at::Scalar rather than a tensor; returns the result by value.
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+struct TORCH_API add__Scalar {  // Groups the signature type, name strings and call/redispatch entry points for the "add_.Scalar" overload; declarations only.
+  using schema = at::Tensor & (at::Tensor &, const at::Scalar &, const at::Scalar &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::add_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha);  // self is a non-const reference and the return is at::Tensor &; presumably the returned reference is self (per the (a!) annotation) - confirm.
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+struct TORCH_API add_Scalar_out {  // Groups the signature type, name strings and call/redispatch entry points for the "add.Scalar_out" overload; declarations only.
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::add")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "add.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha, at::Tensor & out);  // out is the last parameter and is a non-const reference; the return is at::Tensor &.
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha, at::Tensor & out);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arccosh_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arccosh_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..c527d5337556e55f4c3eaf915756df260a8cfda2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arccosh_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API arccosh {  // Groups the signature type, name strings and call/redispatch entry points for "aten::arccosh" (empty overload name); declarations only.
+  using schema = at::Tensor (const at::Tensor &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::arccosh")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "arccosh(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self);  // Declared here, defined elsewhere; returns the result by value.
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+struct TORCH_API arccosh_ {  // Groups the signature type, name strings and call/redispatch entry points for "aten::arccosh_" (empty overload name); declarations only.
+  using schema = at::Tensor & (at::Tensor &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::arccosh_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "arccosh_(Tensor(a!) self) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self);  // self is a non-const reference and the return is at::Tensor &; presumably the returned reference is self (per the (a!) annotation) - confirm.
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+struct TORCH_API arccosh_out {  // Groups the signature type, name strings and call/redispatch entry points for the "arccosh.out" overload; declarations only.
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::arccosh")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);  // out is the last parameter and is a non-const reference; the return is at::Tensor &.
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/as_strided_scatter_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/as_strided_scatter_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..59700a20a2692cfc1c857081547c44cb65b26f7a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/as_strided_scatter_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API as_strided_scatter {  // Groups the signature type, name strings and call/redispatch entry points for "aten::as_strided_scatter" (empty overload name); declarations only.
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, ::std::optional<c10::SymInt>);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::as_strided_scatter")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "as_strided_scatter(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional<c10::SymInt> storage_offset);  // size/stride are SymInt array refs and storage_offset is an optional SymInt; no default arguments at this level.
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional<c10::SymInt> storage_offset);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+struct TORCH_API as_strided_scatter_out {  // Groups the signature type, name strings and call/redispatch entry points for the "as_strided_scatter.out" overload; declarations only.
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, ::std::optional<c10::SymInt>, at::Tensor &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::as_strided_scatter")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "as_strided_scatter.out(Tensor self, Tensor src, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional<c10::SymInt> storage_offset, at::Tensor & out);  // out is the last parameter and is a non-const reference; the return is at::Tensor &.
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional<c10::SymInt> storage_offset, at::Tensor & out);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_update_stats_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_update_stats_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..d45b54b8d4e5dc7cdcf2bac3a585b1d0f3b2a94a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_update_stats_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> batch_norm_update_stats_out(const at::Tensor & input, const ::std::optional<at::Tensor> & running_mean, const ::std::optional<at::Tensor> & running_var, double momentum, at::Tensor & out0, at::Tensor & out1);  // Out variant: takes out0/out1 by non-const reference and returns a tuple of two tensor references.
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> batch_norm_update_stats_cpu(const at::Tensor & input, const ::std::optional<at::Tensor> & running_mean, const ::std::optional<at::Tensor> & running_var, double momentum);  // Returns two tensors by value; presumably the CPU kernel, judging by the _cpu suffix - confirm.
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> batch_norm_update_stats_cuda(const at::Tensor & input, const ::std::optional<at::Tensor> & running_mean, const ::std::optional<at::Tensor> & running_var, double momentum);  // Same signature as the _cpu declaration; presumably the CUDA kernel, judging by the _cuda suffix - confirm.
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ab4bf562101a392da831bf18113de8f585d3ac4
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor binary_cross_entropy_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean);  // Returns the result by value; weight defaults to an empty optional and reduction to at::Reduction::Mean.
+TORCH_API at::Tensor & binary_cross_entropy_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean);  // Output tensor (grad_input) comes first, which lets weight/reduction keep their defaults.
+TORCH_API at::Tensor & binary_cross_entropy_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional<at::Tensor> & weight, int64_t reduction, at::Tensor & grad_input);  // Output tensor (grad_input) comes last; no default arguments.
+
+} // namespace cuda
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/binomial_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/binomial_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..29bf2ce01a6bc7d9f4393deaabe5d17ed55307a1
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/binomial_cuda_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor binomial(const at::Tensor & count, const at::Tensor & prob, ::std::optional<at::Generator> generator=::std::nullopt);  // Declaration only: returns an at::Tensor by value; generator is optional and defaults to ::std::nullopt.
+
+} // namespace cuda
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/clamp_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/clamp_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..477a42efa16f4354fc31b2b9d4abf383c8cdf471
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/clamp_native.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/clamp_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_clamp_out : public at::meta::structured_clamp {  // Derives publicly from at::meta::structured_clamp (declared in the included clamp_meta.h) and adds an impl() declaration.
+void impl(const at::Tensor & self, at::OptionalScalarRef min, at::OptionalScalarRef max, const at::Tensor & out);  // min/max are at::OptionalScalarRef values; out is passed as a const reference. Declared here, defined elsewhere.
+};
+TORCH_API at::Tensor clamp_quantized_cpu(const at::Tensor & self, const ::std::optional<at::Scalar> & min=::std::nullopt, const ::std::optional<at::Scalar> & max=::std::nullopt);  // Declaration only: returns an at::Tensor by value; min and max are optional scalars that both default to ::std::nullopt.
+struct TORCH_API structured_clamp_Tensor_out : public at::meta::structured_clamp_Tensor {  // Derives publicly from at::meta::structured_clamp_Tensor and adds an impl() declaration.
+void impl(const at::Tensor & self, at::OptionalTensorRef min, at::OptionalTensorRef max, const at::Tensor & out);  // Here min/max are at::OptionalTensorRef values (tensor bounds rather than scalar bounds); out is passed as a const reference.
+};
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv3d_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv3d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..53a7e21142c83275eab25e2d4e9b851bef17bf71
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv3d_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor conv3d_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1);  // padding is a c10::SymIntArrayRef here; defaults are bias={}, stride=1, padding=0, dilation=1, groups=1.
+TORCH_API at::Tensor conv3d_padding_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::string_view padding="valid", c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1);  // Differs from conv3d_symint only in padding, which is a c10::string_view defaulting to "valid".
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d04e9214806b006d335ee0fe689a0285f72fcca2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & cudnn_convolution_transpose_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32);  // Output tensor first; integer arguments use at::IntArrayRef / int64_t.
+TORCH_API at::Tensor & cudnn_convolution_transpose_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out);  // Same arguments as the _out form but with the output tensor last.
+TORCH_API at::Tensor & cudnn_convolution_transpose_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32);  // Output tensor first; integer arguments use c10::SymIntArrayRef / c10::SymInt.
+TORCH_API at::Tensor & cudnn_convolution_transpose_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out);  // SymInt form with the output tensor last.
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/dropout_compositeimplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/dropout_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..03d108f257bd00a9d7c2126e9fdd6b65d9a93102
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/dropout_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor dropout(const at::Tensor & input, double p, bool train);  // Takes input by const reference and returns an at::Tensor by value; no default arguments.
+TORCH_API at::Tensor & dropout_(at::Tensor & self, double p, bool train);  // Takes self by non-const reference and returns at::Tensor &; presumably the in-place form returning self - confirm.
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/elu_backward_meta.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/elu_backward_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ff300e425a465de62db33545f50fa03a4a2ef9f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/elu_backward_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_elu_backward : public TensorIteratorBase {  // Derives publicly from TensorIteratorBase and declares a single meta() member.
+    
+    
+    void meta(const at::Tensor & grad_output, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, bool is_result, const at::Tensor & self_or_result);  // Declared here, defined elsewhere; presumably is_result selects how self_or_result is interpreted - confirm against the implementation.
+};
+
+} // namespace meta
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/erf_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/erf_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..8246f18c253164876c870e4a500b7419f46ff278
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/erf_native.h
@@ -0,0 +1,29 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/erf_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_erf_out : public at::meta::structured_erf {  // Derives publicly from at::meta::structured_erf (declared in the included erf_meta.h) and adds an impl() declaration.
+void impl(const at::Tensor & self, const at::Tensor & out);  // Both self and out are passed as const references. Declared here, defined elsewhere.
+};
+TORCH_API at::Tensor erf_sparse(const at::Tensor & self);  // Returns a new at::Tensor by value.
+TORCH_API at::Tensor & erf_sparse_out(const at::Tensor & self, at::Tensor & out);  // Takes out by non-const reference and returns at::Tensor &.
+TORCH_API at::Tensor & erf_sparse_(at::Tensor & self);  // Takes self by non-const reference and returns at::Tensor &.
+TORCH_API at::Tensor erf_sparse_csr(const at::Tensor & self);  // Same signature shape as erf_sparse; presumably the sparse-CSR layout counterpart - confirm.
+TORCH_API at::Tensor & erf_sparse_csr_out(const at::Tensor & self, at::Tensor & out);  // Same signature shape as erf_sparse_out.
+TORCH_API at::Tensor & erf_sparse_csr_(at::Tensor & self);  // Same signature shape as erf_sparse_.
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/expm1.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/expm1.h
new file mode 100644
index 0000000000000000000000000000000000000000..d730175b7e3b6769028ceb4cc3b3f8c486b086d0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/expm1.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/expm1_ops.h>
+
+namespace at {
+
+
+// aten::expm1(Tensor self) -> Tensor. Thin inline wrapper that returns the result by value.
+inline at::Tensor expm1(const at::Tensor & self) {
+    return at::_ops::expm1::call(self);  // Forwards self unchanged to the operator struct's static call().
+}
+
+// aten::expm1_(Tensor(a!) self) -> Tensor(a!). Thin inline wrapper taking self by non-const reference.
+inline at::Tensor & expm1_(at::Tensor & self) {
+    return at::_ops::expm1_::call(self);  // Returns whatever reference the operator struct's static call() returns.
+}
+
+// aten::expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!). This wrapper takes the output tensor first.
+inline at::Tensor & expm1_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::expm1_out::call(self, out);  // Arguments are reordered: call() takes self first and out last.
+}
+// aten::expm1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!). This wrapper takes the output tensor last.
+inline at::Tensor & expm1_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::expm1_out::call(self, out);  // Same target as expm1_out; arguments are forwarded in the order received.
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fft_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fft_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe1d98dc2e07758fc56efefde813383bae209ecf
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fft_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API fft_fft {  // Groups the signature type, name strings and call/redispatch entry points for "aten::fft_fft" (empty overload name); declarations only.
+  using schema = at::Tensor (const at::Tensor &, ::std::optional<c10::SymInt>, int64_t, ::std::optional<c10::string_view>);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_fft")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_fft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, ::std::optional<c10::SymInt> n, int64_t dim, ::std::optional<c10::string_view> norm);  // n and norm are optionals; the defaults listed in schema_str are not applied at this level (no default arguments).
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional<c10::SymInt> n, int64_t dim, ::std::optional<c10::string_view> norm);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+struct TORCH_API fft_fft_out {  // Groups the signature type, name strings and call/redispatch entry points for the "fft_fft.out" overload; declarations only.
+  using schema = at::Tensor & (const at::Tensor &, ::std::optional<c10::SymInt>, int64_t, ::std::optional<c10::string_view>, at::Tensor &);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_fft")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_fft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, ::std::optional<c10::SymInt> n, int64_t dim, ::std::optional<c10::string_view> norm, at::Tensor & out);  // out is the last parameter and is a non-const reference; the return is at::Tensor &.
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional<c10::SymInt> n, int64_t dim, ::std::optional<c10::string_view> norm, at::Tensor & out);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fftshift_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fftshift_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..edf2de936dbdc31a13ab0bf77124f8a3b8d5c3e9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fftshift_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API fft_fftshift {  // Groups the signature type, name strings and call/redispatch entry points for "aten::fft_fftshift" (empty overload name); declarations only.
+  using schema = at::Tensor (const at::Tensor &, at::OptionalIntArrayRef);  // Function type with the same return and parameter types as call() below.
+  using ptr_schema = schema*;  // Pointer-to-function form of schema.
+  // See Note [static constexpr char* members for windows NVCC]; the note and the macro used on the next three lines are defined outside this header.
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_fftshift")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, at::OptionalIntArrayRef dim);  // dim is an at::OptionalIntArrayRef with no default at this level; returns the result by value.
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim);  // Same parameters as call() preceded by a c10::DispatchKeySet.
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_ifftshift_compositeimplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_ifftshift_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd94234a10263e427d0c20b57ea1d94d0db036ca
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_ifftshift_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declaration only; `dim` may be omitted and then defaults to ::std::nullopt.
+TORCH_API at::Tensor fft_ifftshift(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_ifftshift_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_ifftshift_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..15b13d3c8be92b81481e88fb09c39d31d749ea9e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_ifftshift_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// Declares at::native::fft_ifftshift; `dim` may be omitted and then defaults to ::std::nullopt.
+namespace at {
+namespace native {
+TORCH_API at::Tensor fft_ifftshift(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt);
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa4c5cda20b0fc5e44cbc8e9cc58f649c0e09e88
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+// Same parameter set in two orders: `_out` takes out0/out1 first, `_outf` takes them last. Both return a tuple of two Tensor references.
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> grid_sampler_2d_backward_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> grid_sampler_2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array<bool,2> output_mask, at::Tensor & out0, at::Tensor & out1);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..390c841110012ab2ac993e8131db088095d45929
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/grid_sampler_2d_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// `grid_sampler_2d_cpu` and `grid_sampler_2d_cuda` share one signature; `grid_sampler_2d_out` adds a trailing `out` tensor and returns a reference.
+namespace at {
+namespace native {
+TORCH_API at::Tensor & grid_sampler_2d_out(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, at::Tensor & out);
+TORCH_API at::Tensor grid_sampler_2d_cpu(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+TORCH_API at::Tensor grid_sampler_2d_cuda(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/grid_sampler_3d_cpu_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/grid_sampler_3d_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8e065ba16645a2adbfaf5fdacc11a80d9dec72e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/grid_sampler_3d_cpu_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// Declaration only; every parameter is required (no default values).
+TORCH_API at::Tensor grid_sampler_3d(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+
+} // namespace cpu
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/le_meta.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/le_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb34613cb1c0b8235c5351985c7ac6252f79b64e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/le_meta.h
@@ -0,0 +1,32 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_le_Scalar : public TensorIteratorBase {
+    // Declares `meta` for a (Tensor, Scalar) argument pair; its definition is not in this header.
+    
+    void meta(const at::Tensor & self, const at::Scalar & other);
+};
+struct TORCH_API structured_le_Tensor : public TensorIteratorBase {
+    // Declares `meta` for a (Tensor, Tensor) argument pair; its definition is not in this header.
+    
+    void meta(const at::Tensor & self, const at::Tensor & other);
+};
+
+} // namespace meta
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/leaky_relu.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/leaky_relu.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f2fb34f8925daaff93f92ba00d2a134b665f145
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/leaky_relu.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/leaky_relu_ops.h>
+
+namespace at {
+
+// Inline wrapper taking `out` first; it is passed last to leaky_relu_out::call. `negative_slope` defaults to 0.01.
+// aten::leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & leaky_relu_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & negative_slope=0.01) {
+    return at::_ops::leaky_relu_out::call(self, negative_slope, out);
+}
+// aten::leaky_relu.out(Tensor self, Scalar negative_slope=0.01, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & leaky_relu_outf(const at::Tensor & self, const at::Scalar & negative_slope, at::Tensor & out) { // no C++ default for negative_slope here
+    return at::_ops::leaky_relu_out::call(self, negative_slope, out); // arguments forwarded in the order received
+}
+// Inline wrapper returning a Tensor by value; forwards both arguments to leaky_relu::call. `negative_slope` defaults to 0.01.
+// aten::leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor
+inline at::Tensor leaky_relu(const at::Tensor & self, const at::Scalar & negative_slope=0.01) {
+    return at::_ops::leaky_relu::call(self, negative_slope);
+}
+// Inline wrapper taking `self` by non-const reference and returning a Tensor reference; forwards to leaky_relu_::call.
+// aten::leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!)
+inline at::Tensor & leaky_relu_(at::Tensor & self, const at::Scalar & negative_slope=0.01) {
+    return at::_ops::leaky_relu_::call(self, negative_slope);
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool3d_with_indices_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool3d_with_indices_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..597459df1e8d0099a79baaab8e3bb01040d3ef34
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool3d_with_indices_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// The first two declarations default stride/padding/dilation/ceil_mode; `_outf` has no defaults and takes out/indices last.
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> max_pool3d_with_indices(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> max_pool3d_with_indices_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> max_pool3d_with_indices_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices);
+
+} // namespace cuda
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu.h
new file mode 100644
index 0000000000000000000000000000000000000000..f21fc13c068f16295ab8124ff15011861c196ca8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu.h
@@ -0,0 +1,47 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/miopen_convolution_add_relu_ops.h>
+
+namespace at {
+
+// Integer-argument wrapper: stride/padding/dilation pass through c10::fromIntArrayRefSlow before the call; `groups` is passed as-is.
+// aten::miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+inline at::Tensor miopen_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const ::std::optional<at::Scalar> & alpha, const ::std::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
+    return at::_ops::miopen_convolution_add_relu::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
+}
+namespace symint { // T appears in no function parameter, so callers must name it explicitly
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>> // participates only when T is int64_t
+  at::Tensor miopen_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const ::std::optional<at::Scalar> & alpha, const ::std::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
+    return at::_ops::miopen_convolution_add_relu::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups); // same forwarding as the int64_t free function
+  }
+}
+// SymInt-argument wrapper: stride/padding/dilation/groups are forwarded to the call without conversion.
+// aten::miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
+inline at::Tensor miopen_convolution_add_relu_symint(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const ::std::optional<at::Scalar> & alpha, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+    return at::_ops::miopen_convolution_add_relu::call(self, weight, z, alpha, bias, stride, padding, dilation, groups);
+}
+namespace symint { // T appears in no function parameter, so callers must name it explicitly
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>> // participates only when T is c10::SymInt
+  at::Tensor miopen_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const ::std::optional<at::Scalar> & alpha, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
+    return at::_ops::miopen_convolution_add_relu::call(self, weight, z, alpha, bias, stride, padding, dilation, groups); // arguments forwarded without conversion
+  }
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_input_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_input_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..f02141c732f08c96c37d63ecd946525ba28ca96a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_input_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Descriptor for "aten::mkldnn_linear_backward_input" with an empty overload name; the static functions below are declarations only.
+struct TORCH_API mkldnn_linear_backward_input {
+  using schema = at::Tensor (at::IntArrayRef, const at::Tensor &, const at::Tensor &); // C++ function type matching `call`
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::mkldnn_linear_backward_input")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "mkldnn_linear_backward_input(int[] input_size, Tensor grad_output, Tensor weight) -> Tensor")
+  static at::Tensor call(at::IntArrayRef input_size, const at::Tensor & grad_output, const at::Tensor & weight); // parameter types mirror `schema`
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef input_size, const at::Tensor & grad_output, const at::Tensor & weight); // `schema` parameters preceded by a DispatchKeySet
+};
+// Descriptor for "aten::mkldnn_linear_backward_input" with overload name "out"; takes a trailing `out` tensor and returns a Tensor reference.
+struct TORCH_API mkldnn_linear_backward_input_out {
+  using schema = at::Tensor & (at::IntArrayRef, const at::Tensor &, const at::Tensor &, at::Tensor &); // C++ function type matching `call`
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::mkldnn_linear_backward_input")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "mkldnn_linear_backward_input.out(int[] input_size, Tensor grad_output, Tensor weight, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(at::IntArrayRef input_size, const at::Tensor & grad_output, const at::Tensor & weight, at::Tensor & out); // parameter types mirror `schema`
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef input_size, const at::Tensor & grad_output, const at::Tensor & weight, at::Tensor & out); // `schema` parameters preceded by a DispatchKeySet
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multilabel_margin_loss_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multilabel_margin_loss_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..674829faa68acd87469e1a182d5a0cdc3f0b83de
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multilabel_margin_loss_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Descriptor for "aten::multilabel_margin_loss" with overload name "out"; takes a trailing `out` tensor and returns a Tensor reference.
+struct TORCH_API multilabel_margin_loss_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &); // C++ function type matching `call`
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::multilabel_margin_loss")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "multilabel_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & out); // `reduction` has no C++ default at this level
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & out); // `schema` parameters preceded by a DispatchKeySet
+};
+// Descriptor for "aten::multilabel_margin_loss" with an empty overload name; returns a Tensor by value.
+struct TORCH_API multilabel_margin_loss {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t); // C++ function type matching `call`
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::multilabel_margin_loss")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & target, int64_t reduction); // `reduction` has no C++ default at this level
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction); // `schema` parameters preceded by a DispatchKeySet
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm_backward_cpu_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm_backward_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..5318cf8b11ee7c9db25db175b532aa18663f46f7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm_backward_cpu_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// Apart from the `_symint` suffix, the two declarations differ only in normalized_shape's type (at::IntArrayRef vs c10::SymIntArrayRef).
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward_symint(const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const ::std::optional<at::Tensor> & weight, const ::std::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask);
+
+} // namespace cpu
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nuclear_norm_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nuclear_norm_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..09ec588d46d0a3173bf14cdeca59ff6cdafe7894
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nuclear_norm_native.h
@@ -0,0 +1,24 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// Two overload pairs, with and without an explicit `dim`; the `_out` forms require `keepdim` and take `out` last.
+namespace at {
+namespace native {
+TORCH_API at::Tensor nuclear_norm(const at::Tensor & self, bool keepdim=false);
+TORCH_API at::Tensor & nuclear_norm_out(const at::Tensor & self, bool keepdim, at::Tensor & out);
+TORCH_API at::Tensor nuclear_norm(const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false);
+TORCH_API at::Tensor & nuclear_norm_out(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/pin_memory.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/pin_memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f4f167c2221f911e9152098057349e3fea05600
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/pin_memory.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/pin_memory_ops.h>
+
+namespace at {
+
+// This namespace block is empty: the header itself declares nothing inside it.
+
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu.h
new file mode 100644
index 0000000000000000000000000000000000000000..deb7264c4ac23feb5bef55b9888bbd843669a7bc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/relu_ops.h>
+
+namespace at {
+
+// Inline wrapper returning a Tensor by value; forwards `self` unchanged to relu::call.
+// aten::relu(Tensor self) -> Tensor
+inline at::Tensor relu(const at::Tensor & self) {
+    return at::_ops::relu::call(self);
+}
+// Inline wrapper taking `self` by non-const reference and returning a Tensor reference; forwards to relu_::call.
+// aten::relu_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & relu_(at::Tensor & self) {
+    return at::_ops::relu_::call(self);
+}
+// Inline wrapper taking `out` first; it is passed last to relu_out::call.
+// aten::relu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & relu_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::relu_out::call(self, out);
+}
+// aten::relu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & relu_outf(const at::Tensor & self, at::Tensor & out) { // `out` is the trailing parameter in this form
+    return at::_ops::relu_out::call(self, out); // arguments forwarded in the order received
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/roll_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/roll_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a0a072e6abe6eb388e79faa59388ba7171bc7cd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/roll_cuda_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// Only `shifts` changes type between the two declarations; `dims` is at::IntArrayRef with default {} in both.
+TORCH_API at::Tensor roll(const at::Tensor & self, at::IntArrayRef shifts, at::IntArrayRef dims={});
+TORCH_API at::Tensor roll_symint(const at::Tensor & self, c10::SymIntArrayRef shifts, at::IntArrayRef dims={});
+
+} // namespace cuda
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_transpose3d_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_transpose3d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..bcd575000e9e0c9cdd930dfd89bfaafb8df1ba90
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_transpose3d_native.h
@@ -0,0 +1,24 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// The `_cpu`/`_cuda` forms default bias/stride/padding/output_padding/dilation; the `_out_cpu`/`_out_cuda` forms have no defaults and take `out` last.
+namespace at {
+namespace native {
+TORCH_API at::Tensor slow_conv_transpose3d_cpu(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1);
+TORCH_API at::Tensor & slow_conv_transpose3d_out_cpu(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef dilation, at::Tensor & out);
+TORCH_API at::Tensor slow_conv_transpose3d_cuda(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1);
+TORCH_API at::Tensor & slow_conv_transpose3d_out_cuda(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef dilation, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_airy_ai_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_airy_ai_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc6a04132d83fceadcdd92d1709acbd8f7a72f66
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_airy_ai_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Descriptor for "aten::special_airy_ai" with an empty overload name; the static functions below are declarations only.
+struct TORCH_API special_airy_ai {
+  using schema = at::Tensor (const at::Tensor &); // C++ function type matching `call`
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_airy_ai")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_airy_ai(Tensor x) -> Tensor")
+  static at::Tensor call(const at::Tensor & x); // parameter types mirror `schema`
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x); // `schema` parameters preceded by a DispatchKeySet
+};
+// Descriptor for "aten::special_airy_ai" with overload name "out"; takes a trailing `out` tensor and returns a Tensor reference.
+struct TORCH_API special_airy_ai_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &); // C++ function type matching `call`
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_airy_ai")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_airy_ai.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & x, at::Tensor & out); // parameter types mirror `schema`
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, at::Tensor & out); // `schema` parameters preceded by a DispatchKeySet
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_entr_meta.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_entr_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..f5018e78c62bd2beabcbc1d96c18b13eeb8f13b4
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_entr_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_special_entr : public TensorIteratorBase {
+    // Declares `meta` for a single Tensor argument; its definition is not in this header.
+    
+    void meta(const at::Tensor & self);
+};
+
+} // namespace meta
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_gammaincc_compositeimplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_gammaincc_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..a711d334332bf98dceb790ff82074c48341c4f39
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_gammaincc_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations of special_gammaincc under the compositeimplicitautograd namespace.
+namespace compositeimplicitautograd {
+// Functional form returns a Tensor by value; `_out` takes `out` first, `_outf` takes it last.
+TORCH_API at::Tensor special_gammaincc(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & special_gammaincc_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & special_gammaincc_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/triu_indices_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/triu_indices_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..5086cc91f95039dfda0ea4f8a02db62176a33fee
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/triu_indices_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // triu_indices declarations: one out variant, plus _cpu and _cuda functions
+TORCH_API at::Tensor & triu_indices_out(int64_t row, int64_t col, int64_t offset, at::Tensor & out); // no default for offset here, unlike the two below
+TORCH_API at::Tensor triu_indices_cpu(int64_t row, int64_t col, int64_t offset=0, ::std::optional<at::ScalarType> dtype={}, ::std::optional<at::Layout> layout={}, ::std::optional<at::Device> device={}, ::std::optional<bool> pin_memory={});
+TORCH_API at::Tensor triu_indices_cuda(int64_t row, int64_t col, int64_t offset=0, ::std::optional<at::ScalarType> dtype={}, ::std::optional<at::Layout> layout={}, ::std::optional<at::Device> device={}, ::std::optional<bool> pin_memory={});
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_nearest2d_meta_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_nearest2d_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d03fd79c0fe51a2015afa6fc0eb2a08d793e5ec
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_nearest2d_meta_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations of upsample_nearest2d under the meta namespace, with IntArrayRef and SymIntArrayRef output_size variants.
+namespace meta {
+// `_out` variants take `out` first and default both scales to nullopt; `_outf` variants take `out` last with no defaults.
+TORCH_API at::Tensor upsample_nearest2d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor upsample_nearest2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor & upsample_nearest2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor & upsample_nearest2d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional<double> scales_h, ::std::optional<double> scales_w, at::Tensor & out);
+TORCH_API at::Tensor & upsample_nearest2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor & upsample_nearest2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional<double> scales_h, ::std::optional<double> scales_w, at::Tensor & out);
+
+} // namespace meta
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_backward_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..30f6bc43a3b2ee2ec4f10a4f102cd8c981d508f8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_backward_native.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/upsample_trilinear3d_backward_meta.h>
+
+namespace at {
+namespace native { // _out_cpu and _out_cuda structs share one at::meta base and declare identical impl() signatures
+struct TORCH_API structured_upsample_trilinear3d_backward_out_cpu : public at::meta::structured_upsample_trilinear3d_backward {
+void impl(const at::Tensor & grad_output, at::ArrayRef<int64_t> output_size, at::ArrayRef<int64_t> input_size, bool align_corners, ::std::optional<double> scales_d, ::std::optional<double> scales_h, ::std::optional<double> scales_w, const at::Tensor & grad_input);
+}; // NOTE(review): grad_input is taken by const reference although its name suggests it is the result — confirm against the definition
+struct TORCH_API structured_upsample_trilinear3d_backward_out_cuda : public at::meta::structured_upsample_trilinear3d_backward {
+void impl(const at::Tensor & grad_output, at::ArrayRef<int64_t> output_size, at::ArrayRef<int64_t> input_size, bool align_corners, ::std::optional<double> scales_d, ::std::optional<double> scales_h, ::std::optional<double> scales_w, const at::Tensor & grad_input);
+};
+} // namespace native
+} // namespace at