lsmpp committed on Oct 16, 2025

Commit

5fa88dc

verified ·

1 Parent(s): 10f1e6a

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.venv/lib/python3.12/site-packages/torch/include/ATen/core/ATenGeneral.h +3 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/ATenOpList.h +13 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/ATen_fwd.h +46 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/ATen_pch.h +161 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Array.h +48 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Backtrace.h +2 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/CachingHostAllocator.h +737 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/CheckMemoryFormat.h +24 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/DeprecatedTypeProperties.h +139 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/DeprecatedTypePropertiesRegistry.h +33 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Dict.h +396 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Dict_inl.h +208 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/DimVector.h +13 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Dimname.h +48 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/DistributionsHelper.h +332 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Formatting.h +25 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Generator.h +191 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/GeneratorForPrivateuseone.h +39 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/IListRef.h +631 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/IListRef_inl.h +203 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/LegacyTypeDispatch.h +111 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/List.h +491 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/List_inl.h +353 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/MT19937RNGEngine.h +194 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/NamedTensor.h +143 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/NestedIntSymNodeImpl.h +187 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/PhiloxRNGEngine.h +240 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/PythonFallbackKernel.h +35 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/PythonOpRegistrationTrampoline.h +22 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/QuantizerBase.h +84 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Range.h +25 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Reduction.h +14 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Scalar.h +1 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/ScalarType.h +1 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Tensor.h +98 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/TensorAccessor.h +275 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/TensorBase.h +1056 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/TensorBody.h +0 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/TorchDispatchUtils.h +17 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/TransformationHelper.h +175 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/UndefinedTensorImpl.h +1 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/UnsafeFromTH.h +21 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/VariableHooksInterface.h +83 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Variadic.h +92 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/Vitals.h +94 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/alias_info.h +162 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/aten_interned_strings.h +2294 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/blob.h +204 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h +213 -0
.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h +106 -0

.venv/lib/python3.12/site-packages/torch/include/ATen/core/ATenGeneral.h ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ #pragma once
2	+
3	+ #include <c10/macros/Macros.h>

.venv/lib/python3.12/site-packages/torch/include/ATen/core/ATenOpList.h ADDED Viewed

	@@ -0,0 +1,13 @@

+#pragma once
+#include <c10/macros/Export.h>
+namespace c10 {
+struct OperatorName;
+}
+namespace at {
+// check if an op is a custom op (i.e. did not come from native_functions.yaml)
+TORCH_API bool is_custom_op(const c10::OperatorName& opName);
+}

.venv/lib/python3.12/site-packages/torch/include/ATen/core/ATen_fwd.h ADDED Viewed

	@@ -0,0 +1,46 @@

+#pragma once
+#include <c10/core/QScheme.h>
+// Forward declarations of core ATen types used in dispatch functions
+namespace c10 {
+template<typename T>
+class List;
+template<typename T>
+class IListRef;
+class Stream;
+class Scalar;
+class SymInt;
+class SymIntList;
+struct Storage;
+struct TensorOptions;
+template <typename T>
+class ArrayRef;
+template <typename T>
+class OptionalArrayRef;
+}  // namespace c10
+namespace at {
+class Tensor;
+class OptionalTensorRef;
+struct Dimname;
+struct Generator;
+using TensorList = c10::ArrayRef<Tensor>;
+using ITensorListRef = c10::IListRef<Tensor>;
+using IOptTensorListRef = c10::IListRef<OptionalTensorRef>;
+using DimnameList = c10::ArrayRef<Dimname>;
+using IntArrayRef = c10::ArrayRef<int64_t>;
+using OptionalIntArrayRef = c10::OptionalArrayRef<int64_t>;
+using OptionalSymIntArrayRef = c10::OptionalArrayRef<c10::SymInt>;
+using c10::Stream;
+using c10::Storage;
+using c10::QScheme;
+using c10::Scalar;
+using c10::SymInt;
+using c10::SymIntList;
+using c10::TensorOptions;
+}  // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/ATen_pch.h ADDED Viewed

	@@ -0,0 +1,161 @@

+// This global header must not depend on native_functions.yaml or
+// incremental builds will be next to useless
+#pragma push_macro("TORCH_ASSERT_NO_OPERATORS")
+#define TORCH_ASSERT_NO_OPERATORS
+#include <cinttypes>
+// This list of headers was generated using a script that finds
+// high-impact headers and then manually tweaked to remove OS specific
+// or duplicate headers (e.g. <cassert> and <assert.h>) and to remove
+// "impl" headers (e.g BFloat16-inl.h or complex_math.h in c10).
+// To generate the initial list:
+// 1. Build pytorch from scratch with all build caching disabled
+// 2. Generate a build trace with ninjatracing (https://github.com/nico/ninjatracing)
+//    $ ninjatracing /path/to/pytorch/build/.ninja_log > trace_all.json
+// 3. Run pch_gen.py from https://github.com/peterbell10/build_analysis/
+//    $ python pch_gen.py --threshold .80 --target torch_cpu --build_dir /path/to/pytorch/build --trace trace_all.json
+//    Where the threshold can be tweaked until c10 and some of ATen
+//    core are included but TORCH_ASSERT_NO_OPERATORS still passes.
+#include <cerrno>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <chrono>
+#include <complex>
+#include <deque>
+#include <exception>
+#include <functional>
+#include <initializer_list>
+#include <iomanip>
+#include <iosfwd>
+#include <iterator>
+#include <limits>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <new>
+#include <numeric>
+#include <ostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <type_traits>
+#include <typeindex>
+#include <typeinfo>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include <c10/core/Allocator.h>
+#include <c10/core/AutogradState.h>
+#include <c10/core/Backend.h>
+#include <c10/core/DefaultDtype.h>
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/DispatchKey.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/core/GeneratorImpl.h>
+#include <c10/core/InferenceMode.h>
+#include <c10/core/Layout.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/OptionalRef.h>
+#include <c10/core/QScheme.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/ScalarType.h>
+#include <c10/core/ScalarTypeToTypeMeta.h>
+#include <c10/core/Storage.h>
+#include <c10/core/StorageImpl.h>
+#include <c10/core/SymBool.h>
+#include <c10/core/SymFloat.h>
+#include <c10/core/SymInt.h>
+#include <c10/core/SymIntArrayRef.h>
+#include <c10/core/SymNodeImpl.h>
+#include <c10/core/TensorImpl.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/core/UndefinedTensorImpl.h>
+#include <c10/core/WrapDimMinimal.h>
+#include <c10/core/impl/LocalDispatchKeySet.h>
+#include <c10/core/impl/PyInterpreter.h>
+#include <c10/core/impl/SizesAndStrides.h>
+#include <c10/macros/Export.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/AlignOf.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/C++17.h>
+#include <c10/util/ConstexprCrc.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/DimVector.h>
+#include <c10/util/Exception.h>
+#include <c10/util/ExclusivelyOwned.h>
+#include <c10/util/Flags.h>
+#include <c10/util/Float8_e4m3fn.h>
+#include <c10/util/Float8_e5m2.h>
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/FunctionRef.h>
+#include <c10/util/Half.h>
+#include <c10/util/IdWrapper.h>
+#include <c10/util/Logging.h>
+#include <c10/util/MaybeOwned.h>
+#include <c10/util/Metaprogramming.h>
+#include <c10/util/Optional.h>
+#include <c10/util/Registry.h>
+#include <c10/util/SmallVector.h>
+#include <c10/util/StringUtil.h>
+#include <c10/util/ThreadLocalDebugInfo.h>
+#include <c10/util/Type.h>
+#include <c10/util/TypeCast.h>
+#include <c10/util/TypeIndex.h>
+#include <c10/util/TypeList.h>
+#include <c10/util/TypeSafeSignMath.h>
+#include <c10/util/TypeTraits.h>
+#include <c10/util/UniqueVoidPtr.h>
+#include <c10/util/accumulate.h>
+#include <c10/util/bit_cast.h>
+#include <c10/util/bits.h>
+#include <c10/util/complex.h>
+#include <c10/util/floating_point_utils.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/irange.h>
+#include <c10/util/llvmMathExtras.h>
+#include <c10/util/python_stub.h>
+#include <c10/util/qint32.h>
+#include <c10/util/qint8.h>
+#include <c10/util/quint2x4.h>
+#include <c10/util/quint4x2.h>
+#include <c10/util/quint8.h>
+#include <c10/util/safe_numerics.h>
+#include <c10/util/string_utils.h>
+#include <c10/util/string_view.h>
+#include <c10/util/typeid.h>
+#include <ATen/StorageUtils.h>
+#include <ATen/core/ATen_fwd.h>
+#include <ATen/core/DeprecatedTypeProperties.h>
+#include <ATen/core/DeprecatedTypePropertiesRegistry.h>
+#include <ATen/core/DimVector.h>
+#include <ATen/core/Dimname.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/NamedTensor.h>
+#include <ATen/core/QuantizerBase.h>
+#include <ATen/core/TensorAccessor.h>
+#include <ATen/core/TensorBase.h>
+#include <ATen/core/symbol.h>
+#pragma pop_macro("TORCH_ASSERT_NO_OPERATORS")

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Array.h ADDED Viewed

	@@ -0,0 +1,48 @@

+#pragma once
+// A fixed-size array type usable from both host and
+// device code.
+#include <c10/macros/Macros.h>
+#include <c10/util/irange.h>
+namespace at::detail {
+// Fixed-size array holding `size_` elements of type T; the accessors and the
+// fill constructor are marked C10_HOST_DEVICE.
+template <typename T, int size_>
+struct Array {
+  // NOLINTNEXTLINE(*c-array*)
+  T data[size_];
+  // Number of elements, available at compile time.
+  static constexpr int size() {
+    return size_;
+  }
+  // Broadcast constructor: every element is set to `value`.
+  C10_HOST_DEVICE Array(T value) {
+    for (int idx = 0; idx < size_; ++idx) {
+      data[idx] = value;
+    }
+  }
+  // Read-only element access; returns a copy of the element.
+  C10_HOST_DEVICE T operator[](int idx) const {
+    return data[idx];
+  }
+  // Mutable element access.
+  C10_HOST_DEVICE T& operator[](int idx) {
+    return data[idx];
+  }
+  // Defaulted special members. The ROCm build marks them C10_HOST_DEVICE;
+  // the other build marks the move operations noexcept instead.
+#ifdef USE_ROCM
+  C10_HOST_DEVICE Array() = default;
+  C10_HOST_DEVICE Array(const Array&) = default;
+  C10_HOST_DEVICE Array& operator=(const Array&) = default;
+  C10_HOST_DEVICE Array(Array&&) = default;
+  C10_HOST_DEVICE Array& operator=(Array&&) = default;
+  C10_HOST_DEVICE ~Array() = default;
+#else
+  Array() = default;
+  Array(const Array&) = default;
+  Array& operator=(const Array&) = default;
+  Array(Array&&) noexcept = default;
+  Array& operator=(Array&&) noexcept = default;
+  ~Array() = default;
+#endif
+};
+} // namespace at::detail

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Backtrace.h ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ #include <c10/util/Backtrace.h>
2	+ #include <c10/util/Type.h>

.venv/lib/python3.12/site-packages/torch/include/ATen/core/CachingHostAllocator.h ADDED Viewed

	@@ -0,0 +1,737 @@

+#pragma once
+#include <c10/core/Allocator.h>
+#include <c10/core/Stream.h>
+#include <c10/core/thread_pool.h>
+#include <c10/util/flat_hash_map.h>
+#include <c10/util/llvmMathExtras.h>
+#include <optional>
+#include <deque>
+#include <mutex>
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
+namespace at {
+using c10::CachingAllocator::Stat;
+using c10::CachingAllocator::DurationStat;
+/**
+ * HostBlock is the bookkeeping record for a single host memory allocation:
+ * its size, its address, an in-use flag, the number of events still
+ * associated with it, and the set of streams (template parameter S) on which
+ * it has been used. According to Note [HostAllocator design] it is the
+ * fundamental memory block used for pinned memory, and it is meant as a base
+ * struct that each backend can inherit from and extend.
+ */
+template <typename S>
+struct HostBlock {
+  // constructor for search key: only size_ is set, ptr_ keeps its nullptr default
+  HostBlock(size_t size) : size_(size) {}
+  HostBlock(size_t size, void* ptr) : size_(size), ptr_(ptr) {}
+  std::mutex mutex_; // taken by free() and record_event() while they update this block
+  size_t size_{0}; // block size in bytes
+  void* ptr_{nullptr}; // memory address
+  bool allocated_{false}; // in-use flag
+  size_t event_count_{0}; // number of related events
+  ska::flat_hash_set<S> streams_; // streams on which the block was used
+};
+template <typename B>
+struct alignas(64) FreeBlockList { // one free list; NOTE(review): the 64-byte alignment is presumably to avoid false sharing between lists - confirm
+  std::mutex mutex_; // held around every access to list_ made by the allocator below
+  std::deque<B*> list_; // cached blocks that are currently not in use
+};
+namespace {
+  // Upper bound on the size-bucket index: block sizes are rounded to powers of two and bucketed by log2(size). Also sizes the per-bucket stat vectors in HostStatsStaged.
+  // NOLINTNEXTLINE(misc-definitions-in-headers)
+  constexpr size_t MAX_SIZE_INDEX = 64;
+}
+// Struct containing memory allocator summary statistics for host, as returned by getStats().
+struct TORCH_API HostStats {
+  // COUNT: allocations requested by client code. Note that active
+  // count can be extracted by looking at current allocations
+  Stat allocation;
+  // COUNT: number of allocated segments from host memory allocation.
+  Stat segment;
+  // SUM: bytes allocated by this memory allocator. Note that active bytes
+  // can be extracted by looking at current bytes allocated
+  Stat allocated_bytes;
+  // SUM: bytes reserved by this memory allocator (both free and used)
+  Stat reserved_bytes;
+  // SUM: time spent in cudaHostAlloc/cudaHostRegister in microseconds
+  DurationStat host_alloc_time;
+  // SUM: time spent in cudaHostFree/cudaHostUnregister in microseconds
+  DurationStat host_free_time;
+  // COUNT: number of times cudaHostAlloc/cudaHostRegister was called because
+  // the request could not be satisfied from existing free blocks.
+  int64_t num_host_alloc = 0; // getStats() sets this to segment.allocated
+  // COUNT: number of times cudaHostFree/cudaHostUnregister was called.
+  int64_t num_host_free = 0; // getStats() sets this to segment.freed
+};
+// Struct containing memory allocator summary statistics for host, as they
+// are staged for reporting. The allocator updates these under locks it already
+// holds; getStats() converts them into a HostStats snapshot.
+struct alignas(64) HostStatsStaged {
+  std::mutex timing_mutex_; // protects host_alloc_time and host_free_time
+  // COUNT: allocations requested by client code resulting in a new segment/block allocation
+  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
+  Stat allocation;
+  // SUM: bytes within active memory blocks, including blocks that are
+  // currently in the free list.
+  // LOCK: access to this stat is protected by the allocator's blocks_mutex_
+  Stat allocated_bytes;
+  // COUNT: number of allocations per bucket
+  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
+  std::vector<Stat> allocation_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
+  // SUM: bytes of allocation per bucket
+  // LOCK: access to this stat is protected by the per bucket free_list_[index].mutex_
+  std::vector<Stat> allocated_bytes_bucket_stats = std::vector<Stat>(MAX_SIZE_INDEX);
+  // SUM: time spent in cudaHostAlloc/cudaHostRegister
+  // LOCK: access to this stat is protected by the timing_mutex_
+  DurationStat host_alloc_time;
+  // SUM: time spent in cudaHostFree/cudaHostUnregister
+  // LOCK: access to this stat is protected by the timing_mutex_
+  DurationStat host_free_time;
+};
+/**
+ * Note [HostAllocator design]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * We have three key data structures - the free list which stores blocks that
+ * are not currently used, the block list which stores all blocks that have been
+ * allocated, and the event queue which stores runtime events and their
+ * corresponding blocks.
+ *
+ * Each of these is protected by a separate mutex. The key design principles
+ * are to 1) hold each mutex for the minimal amount of time possible, and 2)
+ * never perform potentially expensive operations (such as CUDA runtime API
+ * calls) while holding a lock.
+ *
+ * There are four public methods: allocate, free, record_event and empty_cache.
+ *   1) In the allocate path, we first check to see if we can service our
+ * request from this free list, and otherwise we create a new block with
+ * allocate_host_memory.
+ *   2) In the free path, we insert events (if required) into the event queue,
+ * and if possible insert our block back into the free list. In allocate, we
+ * first eagerly query events until we find one that is not ready, and insert
+ * the corresponding block onto the free list if all the events recorded for a
+ * block are ready.
+ *   3) In the record_event path, we simply insert the given stream into the set
+ * of streams tracked by the specified block. This set of streams is then
+ * consumed in the free path.
+ *   4) In the empty_cache path, we flush any available blocks into the free
+ * list. We then remove all elements from the free list, remove them from the
+ * block list, and release the associated pinned memory allocations via
+ * free_block.
+ *
+ * We generalize the caching host allocator into two parts: interface and
+ * implementation. Any new backend that wants to integrate with the host
+ * allocator and reuse the caching mechanism must specialize both parts.
+ *
+ * For the implementation, we provide a CachingHostAllocatorImpl struct
+ * to abstract the caching mechanism. Any backend needs to provide a customized
+ * implementation by specializing its own public functions and the related
+ * runtime functions. Its template parameter S represents runtime Stream, E
+ * denotes runtime Event, B indicates the fundamental memory block.
+ *
+ * For the interface, we provide a CachingHostAllocatorInterface struct as an
+ * interface. Any backend needs to derive its own host allocator from this
+ * interface. Its template parameter T refers to an implementation that
+ * inherited from CachingHostAllocatorImpl.
+ *
+ * So this design can share the caching mechanism across each backend, and
+ * provide flexibility to each backend. A backend can choose to follow this
+ * implementation or reuse them by extending and overriding them as necessary.
+ * Taking CUDA as an example, it specializes runtime related functions to reuse
+ * the caching mechanism. Additionally, it extends the allocator's functionality
+ * by adding the allocWithCudaHostRegister function to support page-locking the
+ * memory range used by CUDA. Of course, you can also refer to
+ * XPUCachingHostAllocator, which is a host caching allocator supported on XPU
+ * backend, to implement a basic host caching allocator.
+ *
+ * Some of the invariants here are less strict than they could be - for example,
+ * we do not enforce that free(Block* block) => block->event_count == 0. This is
+ * for compatibility reasons, and we can explore enforcing these in subsequent
+ * versions.
+ *
+ * Note that this caching host allocator does not split larger allocations into
+ * smaller blocks, unlike the caching device allocator.
+ *
+ * In order to gather statistics about caching host allocator while minimally
+ * impacting performance, we use a HostStatsStaged struct to stage the stats
+ * before reporting them. This is done to avoid adding new locks to the allocator.
+ * Collecting stats is carefully done under existing locks, and then the staged
+ * stats are converted to the final stats when getStats is called. At that time
+ * we hold the same locks as empty_cache, to ensure the fidelity of the stats.
+ */
+template <
+    typename S,
+    typename E,
+    typename B = HostBlock<S>>
+struct CachingHostAllocatorImpl {
+  virtual ~CachingHostAllocatorImpl() {
+    active_ = false; // the background loop started in allocate() runs `while (active_)`, so this lets it exit
+    if (pinned_use_background_threads()) { // NOTE(review): a virtual call made from a destructor resolves to this struct's own version (which returns false below), not to a derived override - confirm the wait is reachable as intended
+      getBackgroundThreadPool()->waitWorkComplete();
+    }
+  }
+ public:
+  // Returns a {data pointer, block pointer} pair; both are nullptr when size == 0. The block pointer is the `ctx` that free() later casts back to B*.
+  virtual std::pair<void*, void*> allocate(size_t size) {
+    if (size == 0) {
+      return {nullptr, nullptr};
+    }
+    // If we are using background threads, we can process events in the
+    // background. Otherwise they are processed inline here, before the lookup.
+    if (!pinned_use_background_threads()) {
+      process_events();
+    }
+    // Round up the allocation to the nearest power of two to improve reuse.
+    // These power of two sizes are also used to index into the free list.
+    size_t roundSize = c10::llvm::PowerOf2Ceil(size);
+    // First, try to allocate from the free list
+    auto* block = get_free_block(roundSize);
+    if (block) {
+      return {block->ptr_, reinterpret_cast<void*>(block)};
+    }
+    // Check in the recently freed blocks with pending events to see if we
+    // can reuse them. Call get_free_block again after processing events
+    if (pinned_use_background_threads()) {
+      process_events_for_specific_size(roundSize);
+      block = get_free_block(roundSize);
+      if (block) {
+        return {block->ptr_, reinterpret_cast<void*>(block)};
+      }
+      // Launch the background thread and process events in a loop. The function-local static means the launch runs only once, capturing the `this` of whichever instance gets here first - NOTE(review): confirm a single instance per specialization is expected.
+      static bool background_thread_flag [[maybe_unused]] = [this] {
+        getBackgroundThreadPool()->run([&]() {
+          while (active_) { // cleared by the destructor
+            process_events();
+            std::this_thread::sleep_for(std::chrono::microseconds(100));
+          }
+        });
+        return true;
+      }();
+    }
+    // Slow path: if we can't allocate from the cached free list, we need
+    // to create a new block.
+    void* ptr = nullptr;
+    allocate_host_memory(roundSize, &ptr);
+    // Then, create a new block and register it via add_allocated_block.
+    block = new B(roundSize, ptr);
+    block->allocated_ = true;
+    add_allocated_block(block);
+    return {block->ptr_, reinterpret_cast<void*>(block)};
+  }
+  virtual void free(void* ctx) { // ctx is the block pointer returned by allocate(); a null ctx is ignored
+    if (!ctx) {
+      return;
+    }
+    // Note: we can assume that free is correctly paired with alloc, and thus we
+    // do not need to look up the ctx in blocks_.
+    auto* block = reinterpret_cast<B*>(ctx);
+    std::optional<std::vector<E>> events; // stays empty when the block was used on no stream
+    {
+      std::lock_guard<std::mutex> g(block->mutex_);
+      block->allocated_ = false;
+      if (block->streams_.empty()) {
+        TORCH_INTERNAL_ASSERT(block->event_count_ == 0);
+      } else {
+        events = std::vector<E>();
+        events->reserve(block->streams_.size());
+        for (auto stream : block->streams_) {
+          record_stream(events, stream);
+        }
+        block->event_count_ += events->size();
+        block->streams_.clear();
+      }
+    }
+    if (!events) { // no events were recorded: the block goes straight back to its size bucket
+      auto index = size_index(block->size_);
+      std::lock_guard<std::mutex> g(free_list_[index].mutex_);
+      free_list_[index].list_.push_back(block);
+      stats_.allocation_bucket_stats[index].decrease(1);
+      stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
+    } else {
+      // Queue the events recorded for the streams that used this block; each entry pairs an event with the block.
+      std::lock_guard<std::mutex> g(events_mutex_);
+      for (auto&& event : *events) {
+        events_.emplace_front(std::move(event), block);
+      }
+    }
+  }
+  virtual bool record_event(void* ptr, void* ctx, c10::Stream s) { // returns true if the stream was added to a known block's stream set, false otherwise
+    S stream = S(s);
+    auto* block = reinterpret_cast<B*>(ctx);
+    // Note: we need to check if the passed-in `ctx` is valid. This is because
+    // `record_event` (via `CachingHostAllocator_recordEvent`) can be invoked on
+    // an arbitrary tensor, and is not guaranteed to correspond to a pinned
+    // memory allocation. Therefore, we need to check that `ctx` is valid before
+    // proceeding.
+    {
+      std::lock_guard<std::mutex> g(blocks_mutex_);
+      if (blocks_.find(block) != blocks_.end()) {
+        // Now we know this object is safe to access.
+        std::lock_guard<std::mutex> gb(block->mutex_);
+        TORCH_INTERNAL_ASSERT(block->allocated_);
+        block->streams_.insert(stream);
+        return true;
+      }
+      auto it = ptr_to_block_.find(ptr); // ctx was not a known block: fall back to a lookup by data pointer
+      if (it != ptr_to_block_.end()) {
+        block = it->second;
+        std::lock_guard<std::mutex> g(block->mutex_); // shadows the outer `g`; blocks_mutex_ is still held
+        TORCH_INTERNAL_ASSERT(block->allocated_);
+        block->streams_.insert(stream);
+        return true;
+      }
+    }
+    return false;
+  }
+  virtual void empty_cache() {
+    // Flush any available blocks into the free_list.
+    process_events();
+    // Remove all elements from the free list, remove them from the blocks
+    // list, and free the associated pinned memory allocation. This requires
+    // concurrently holding both the free list mutexes and the blocks mutex, and
+    // is the only function that concurrently holds multiple mutexes.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      std::lock(free_list_[i].mutex_, blocks_mutex_); // acquire both without deadlock, then adopt them into guards
+      std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+      std::vector<B*> blocks_to_remove(free_list_[i].list_.begin(), free_list_[i].list_.end());
+      free_list_[i].list_.clear();
+      for (auto* block : blocks_to_remove) {
+        blocks_.erase(block);
+        ptr_to_block_.erase(block->ptr_);
+        stats_.allocation.decrease(1);
+        stats_.allocated_bytes.decrease(block->size_);
+        free_block(block); // defined outside this view; per the comment above it releases the pinned allocation
+        delete block; // the B object itself was created with `new` in allocate()
+      }
+    }
+  }
+  inline size_t size_index(size_t size) {
+    // Map a byte size to its bucket index using Log2_64_Ceil.
+    const auto bucket = c10::llvm::Log2_64_Ceil(size);
+    return bucket;
+  }
+  virtual bool pinned_use_background_threads() { // default: no background thread, so allocate() calls process_events() inline
+    return false;
+  }
+  virtual void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const { // default stub: all arguments are unused
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for copy_data"); // the check is given a constant false condition; NOTE(review): presumably backends override this - confirm
+  }
+  HostStats getStats() { // builds a HostStats snapshot from the staged stats_ counters
+    HostStats stats;
+    // To keep getStats lightweight we do *not* flush any available blocks
+    // into the free_list. This may skew the stats a bit.
+    auto add_bucket_stats = [](Stat& accumulator, const Stat& other) {
+      accumulator.allocated += other.allocated;
+      accumulator.current += other.current;
+      accumulator.freed += other.freed;
+      // Since peaks are measured per bucket independently, we add them up
+      // to estimate the total peak. This is not strictly correct, but it is
+      // the best approximation we can get after the fact.
+      accumulator.peak += other.peak;
+    };
+    // Accurate reading of memory stats requires concurrently holding both the
+    // free list mutexes and the blocks mutex. Previously, this was only done in
+    // empty_cache function.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      std::lock(free_list_[i].mutex_, blocks_mutex_); // both locks are taken per bucket and released at the end of each iteration
+      std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+      // We collect the slow-path stats only once, since they are not collected
+      // per bucket (we pick index 0 arbitrarily). These are also all the host
+      // allocations, not taking into account caching and free lists.
+      if (i == 0) {
+        stats.segment = stats_.allocation;
+        stats.reserved_bytes = stats_.allocated_bytes;
+        stats.num_host_alloc = stats.segment.allocated;
+        stats.num_host_free = stats.segment.freed;
+      }
+      // Bucket stats need to be merged with the slow-path stats. We do this in
+      // a best effort manner, since we can't really replay the cached events per bucket.
+      add_bucket_stats(stats.allocation, stats_.allocation_bucket_stats[i]);
+      add_bucket_stats(stats.allocated_bytes, stats_.allocated_bytes_bucket_stats[i]);
+    }
+    // Get the timing stats (guarded by their own mutex)
+    {
+      std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+      stats.host_alloc_time = stats_.host_alloc_time;
+      stats.host_free_time = stats_.host_free_time;
+    }
+    return stats;
+  }
+  void resetAccumulatedStats() {
+    // Resetting accumulated memory stats requires concurrently holding both the
+    // free list mutexes and the blocks mutex. Previously, this was only done in
+    // empty_cache function.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      std::lock(free_list_[i].mutex_, blocks_mutex_);
+      std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+      if (i == 0) { // the non-bucketed stats are reset once, on the first iteration
+        stats_.allocation.reset_accumulated();
+        stats_.allocated_bytes.reset_accumulated();
+      }
+      stats_.allocation_bucket_stats[i].reset_accumulated();
+      stats_.allocated_bytes_bucket_stats[i].reset_accumulated();
+    }
+    // Also reset timing stats
+    {
+      std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+      stats_.host_alloc_time.reset_accumulated();
+      stats_.host_free_time.reset_accumulated();
+    }
+  }
+  void resetPeakStats() {
+    // Resetting peak memory stats requires concurrently holding both the
+    // free list mutexes and the blocks mutex. Previously, this was only done in
+    // empty_cache function.
+    for (size_t i = 0; i < free_list_.size(); ++i) {
+      std::lock(free_list_[i].mutex_, blocks_mutex_);
+      std::lock_guard<std::mutex> gf(free_list_[i].mutex_, std::adopt_lock);
+      std::lock_guard<std::mutex> gb(blocks_mutex_, std::adopt_lock);
+      if (i == 0) { // the non-bucketed stats are reset once, on the first iteration
+        stats_.allocation.reset_peak();
+        stats_.allocated_bytes.reset_peak();
+      }
+      stats_.allocation_bucket_stats[i].reset_peak();
+      stats_.allocated_bytes_bucket_stats[i].reset_peak();
+    }
+    // Also reset timing stats
+    {
+      std::lock_guard<std::mutex> g(stats_.timing_mutex_);
+      stats_.host_alloc_time.reset_peak();
+      stats_.host_free_time.reset_peak();
+    }
+  }
+ private:
+  // Registers a newly created block in blocks_ and ptr_to_block_ and records
+  // it in both the global and the per-bucket statistics.
+  virtual void add_allocated_block(B* block) {
+    std::lock_guard<std::mutex> blocks_guard(blocks_mutex_);
+    blocks_.insert(block);
+    stats_.allocation.increase(1);
+    stats_.allocated_bytes.increase(block->size_);
+    ptr_to_block_.insert({block->ptr_, block});
+    // On this slow path the bucket lock has to be taken briefly as well so
+    // that the per-bucket counters can be updated; this should be rare once
+    // the cache is warm. It is released before blocks_guard, as locals are
+    // destroyed in reverse order of declaration.
+    const auto block_bytes = block->size_;
+    const auto bucket = size_index(block_bytes);
+    std::lock_guard<std::mutex> bucket_guard(free_list_[bucket].mutex_);
+    stats_.allocation_bucket_stats[bucket].increase(1);
+    stats_.allocated_bytes_bucket_stats[bucket].increase(block_bytes);
+  }
+  // Pops the most recently cached block from the bucket for `size`, marks it
+  // as allocated and updates the bucket statistics. Returns nullptr when the
+  // bucket holds no cached block.
+  virtual B* get_free_block(size_t size) {
+    const auto bucket = size_index(size);
+    auto& cached = free_list_[bucket];
+    std::lock_guard<std::mutex> bucket_guard(cached.mutex_);
+    if (cached.list_.empty()) {
+      return nullptr;
+    }
+    B* reused = cached.list_.back();
+    cached.list_.pop_back();
+    reused->allocated_ = true;
+    stats_.allocation_bucket_stats[bucket].increase(1);
+    stats_.allocated_bytes_bucket_stats[bucket].increase(size);
+    return reused;
+  }
+  // Processes queued events without filtering by block size: forwards to
+  // process_events_for_specific_size with the -1 sentinel.
+  virtual void process_events() {
+    // process all events until the last unready event, not for specific size.
+    process_events_for_specific_size(-1);
+  }
+  // Processes queued events and returns blocks whose events have all
+  // completed to their per-size free list.
+  //
+  // If size is -1, events are taken from the back of the queue until the
+  // queue is empty or the first unready event is met (that event is pushed
+  // back and processing stops). Otherwise only events whose block has exactly
+  // `size` bytes are queried; non-matching or unready events are rotated to
+  // the front of the queue, and the function returns as soon as one matching
+  // block has been added to the free list or once more events than were
+  // queued on entry have been inspected.
+  virtual void process_events_for_specific_size(int64_t size) {
+    size_t event_count = 0;
+    size_t max_events = 0;
+    {
+      std::lock_guard<std::mutex> g(events_mutex_);
+      max_events = events_.size();
+    }
+    while (true) {
+      // Avoid calling cudaEventDestroy while holding a mutex, so move
+      // intermediate events out of the lock into this object.
+      // process the last event
+      std::optional<std::pair<E, B*>> processed;
+      {
+        std::lock_guard<std::mutex> g(events_mutex_);
+        if (!events_.empty()) {
+          processed = std::move(events_.back());
+          events_.pop_back();
+        }
+      }
+      if (!processed) {
+        return;
+      }
+      if (size != -1) {
+        // Bound the scan: events are rotated from the back to the front, so
+        // without a limit the loop could cycle through the queue forever.
+        if (event_count++ > max_events) {
+          {
+            std::lock_guard<std::mutex> g(events_mutex_);
+            events_.push_front(std::move(*processed));
+          }
+          return;
+        }
+        if (size != static_cast<int64_t>(processed->second->size_)) {
+          // if we are processing a specific size, and the size of the block
+          // doesn't match, we can't use it.
+          {
+            std::lock_guard<std::mutex> g(events_mutex_);
+            events_.push_front(std::move(*processed));
+          }
+          continue;
+        }
+      }
+      // otherwise, query the event
+      {
+        // now, see if we can handle this element
+        auto& event = processed->first;
+        if (!query_event(event)) {
+          // The event is not ready: put it back. When processing everything
+          // it goes to the back and we stop; when looking for a specific size
+          // it goes to the front and we keep scanning.
+          {
+            std::lock_guard<std::mutex> g(events_mutex_);
+            if (size == -1) {
+              events_.push_back(std::move(*processed));
+              return;
+            } else {
+              events_.push_front(std::move(*processed));
+              continue;
+            }
+          }
+        }
+      }
+      // Process the events.
+      TORCH_INTERNAL_ASSERT(processed);
+      auto* block = processed->second;
+      bool available = false;
+      {
+        std::lock_guard<std::mutex> g(block->mutex_);
+        TORCH_INTERNAL_ASSERT(!block->allocated_);
+        block->event_count_--;
+        if (block->event_count_ == 0) {
+          available = true;
+        }
+      }
+      if (available) {
+        auto index = size_index(block->size_);
+        std::lock_guard<std::mutex> g(free_list_[index].mutex_);
+        free_list_[index].list_.push_back(block);
+        stats_.allocation_bucket_stats[index].decrease(1);
+        // Use the block's own size here: `size` is the -1 sentinel when all
+        // events are being processed and must not be applied to the byte
+        // counter.
+        stats_.allocated_bytes_bucket_stats[index].decrease(block->size_);
+        if (size != -1) {
+          return;
+        }
+      }
+    }
+  }
+  // Returns a TaskThreadPool that is created on the first call (function-local
+  // static) with constructor argument 1 -- presumably a single worker thread;
+  // confirm against TaskThreadPool. The pool is allocated with `new` and is
+  // not deleted anywhere in the code shown here.
+  // NOTE(review): looks like an intentional leak to avoid static destruction
+  // order problems -- confirm.
+  TaskThreadPool* getBackgroundThreadPool() {
+    static TaskThreadPool* pool = new TaskThreadPool(1);
+    return pool;
+  }
+  /* These following functions are runtime-related. */
+  // Allocate page-locked memory on the host.
+  // The implementation in this class unconditionally fails the
+  // TORCH_CHECK_NOT_IMPLEMENTED check; presumably each backend overrides it
+  // and writes the allocation of `size` bytes into `*ptr`.
+  virtual void allocate_host_memory(size_t size, void** ptr) {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false, "Not implemented for allocate_host_memory");
+  }
+  // Free block and release the pointer contained in block.
+  // The implementation in this class unconditionally fails the
+  // TORCH_CHECK_NOT_IMPLEMENTED check; presumably backends override it.
+  virtual void free_block(B* block) {
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block");
+  }
+  // Record an event on stream and store event into events.
+  // The implementation in this class unconditionally fails the
+  // TORCH_CHECK_NOT_IMPLEMENTED check; presumably backends override it.
+  virtual void record_stream(std::optional<std::vector<E>>& events, S stream) {
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream");
+  }
+  // Query event if it is completed.
+  // Callers in this class treat a `true` result as "event is ready".
+  // The implementation in this class unconditionally fails the
+  // TORCH_CHECK_NOT_IMPLEMENTED check and therefore has no return statement;
+  // presumably backends override it.
+  virtual bool query_event(E& event) {
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
+  }
+  // Locked by add_allocated_block while it updates blocks_, ptr_to_block_ and
+  // the allocator-wide counters in stats_.
+  // NOTE(review): alignas(64) presumably keeps the hot members on separate
+  // cache lines -- confirm.
+  alignas(64) std::mutex blocks_mutex_;
+  ska::flat_hash_set<B*> blocks_; // block list
+  // Maps a block's ptr_ to the block itself.
+  ska::flat_hash_map<void*, B*> ptr_to_block_;
+  // We keep free list as a vector of free lists, one for each power of two
+  // size. This allows us to quickly find a free block of the right size.
+  // We use deque to store per size free list and guard the list with its own
+  // mutex.
+  alignas(64) std::vector<FreeBlockList<B>> free_list_ =
+      std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
+  // Locked around every access to events_ in process_events_for_specific_size.
+  alignas(64) std::mutex events_mutex_;
+  std::deque<std::pair<E, B*>> events_; // event queue paired with block
+  // Indicates whether the object is active.
+  // Set to false in the destructor to signal background threads to stop.
+  std::atomic<bool> active_{true};
+protected:
+  // Allocation and timing statistics. Per-bucket counters are updated while
+  // the matching free_list_ mutex is held; timing counters are guarded by
+  // stats_.timing_mutex_.
+  alignas(64) HostStatsStaged stats_;
+};
+// Abstract interface for host allocators. It extends at::Allocator with
+// pure-virtual operations for event recording, cache management and
+// statistics.
+struct TORCH_API HostAllocator : public at::Allocator {
+  // Associates the pinned memory allocation with a stream to track
+  // dependencies. This ensures the memory won't be reused until the stream's
+  // operations complete
+  virtual bool record_event(void* ptr, void* ctx, c10::Stream stream) = 0;
+  // Frees all cached pinned memory and returns it to the system, clearing the
+  // allocator's internal cache
+  virtual void empty_cache() = 0;
+  // Returns comprehensive statistics about the allocator's memory usage,
+  // allocation patterns, and timing metrics
+  virtual HostStats get_stats() = 0;
+  // Resets the cumulative allocation statistics
+  virtual void reset_accumulated_stats() = 0;
+  // Resets the peak memory usage metrics
+  virtual void reset_peak_stats() = 0;
+};
+// HostAllocator adapter that owns an implementation object of type T and
+// forwards every operation to it. `deleteFunc` is installed as the deleter of
+// each DataPtr handed out by allocate().
+template <typename T, c10::DeleterFnPtr deleteFunc>
+struct CachingHostAllocatorInterface : public HostAllocator {
+  CachingHostAllocatorInterface() : impl_(std::make_unique<T>()) {}
+  // Allocates through the implementation and wraps the returned
+  // (pointer, context) pair into a CPU DataPtr.
+  at::DataPtr allocate(size_t size) override {
+    auto allocation = impl_->allocate(size);
+    return at::DataPtr(
+        allocation.first,
+        allocation.second,
+        deleteFunc, // Use the template parameter deleter function
+        at::DeviceType::CPU);
+  }
+  // Hands the context of an allocation back to the implementation.
+  void free(void* ctx) {
+    impl_->free(ctx);
+  }
+  // See HostAllocator::record_event; forwarded to the implementation.
+  bool record_event(void* ptr, void* ctx, c10::Stream stream) override {
+    return impl_->record_event(ptr, ctx, stream);
+  }
+  // See HostAllocator::empty_cache; forwarded to the implementation.
+  void empty_cache() override {
+    impl_->empty_cache();
+  }
+  // Copies `count` bytes from `src` to `dest` via the implementation.
+  void copy_data(void* dest, const void* src, std::size_t count)
+      const override {
+    impl_->copy_data(dest, src, count);
+  }
+  // Statistics accessors; each forwards to the implementation.
+  HostStats get_stats() override {
+    return impl_->getStats();
+  }
+  void reset_accumulated_stats() override {
+    impl_->resetAccumulatedStats();
+  }
+  void reset_peak_stats() override {
+    impl_->resetPeakStats();
+  }
+  // The wrapped implementation; created in the constructor.
+  std::unique_ptr<T> impl_;
+};
+// Declares a host allocator in the current translation unit:
+//   - forward-declares the deleter function `deleter(void*)`,
+//   - defines the final struct `name` deriving from
+//     at::CachingHostAllocatorInterface<impl, deleter>,
+//   - defines a static object `instance` of that type,
+//   - defines `deleter` so that it calls instance.free(ptr).
+#define DECLARE_HOST_ALLOCATOR(name, impl, deleter, instance)       \
+  void deleter(void* ptr);                                          \
+  struct name final                                                 \
+      : public at::CachingHostAllocatorInterface<impl, deleter> {}; \
+  static name instance;                                                    \
+  void deleter(void* ptr) {                                         \
+    instance.free(ptr);                                             \
+  }
+/**
+ * Set the host allocator for DeviceType `device_type`. This allocator manages
+ * pinned memory on the host that can be accessed efficiently by the specified
+ * device type. Note that this function is not thread-safe.
+ *
+ * NOTE(review): `priority` defaults to 0; presumably it decides which
+ * allocator wins when several are registered for the same device type --
+ * the definition is not visible here, confirm.
+ */
+TORCH_API void setHostAllocator(
+    at::DeviceType device_type,
+    at::HostAllocator* allocator,
+    uint8_t priority = 0);
+/**
+ * Returns the host allocator for DeviceType `device_type` -- presumably the
+ * one installed through setHostAllocator; the definition is not visible here.
+ */
+TORCH_API at::HostAllocator* getHostAllocator(at::DeviceType device_type);
+// Helper whose constructor registers `allocator` for `device_type` by calling
+// at::setHostAllocator (with the default priority).
+template <DeviceType device_type>
+struct HostAllocatorRegistry {
+  explicit HostAllocatorRegistry(HostAllocator* allocator) {
+    at::setHostAllocator(device_type, allocator);
+  }
+};
+// Defines, inside an anonymous namespace, a static
+// at::HostAllocatorRegistry<device_type> object constructed from `allocator`,
+// so the registration runs when that object is initialized.
+#define REGISTER_HOST_ALLOCATOR(device_type, allocator) \
+  namespace {                                           \
+  static at::HostAllocatorRegistry<device_type>         \
+      g_host_allocator_registry_instance(allocator);    \
+  }
+} // namespace at
+C10_DIAGNOSTIC_POP()

.venv/lib/python3.12/site-packages/torch/include/ATen/core/CheckMemoryFormat.h ADDED Viewed

	@@ -0,0 +1,24 @@

+#include <c10/core/TensorOptions.h>
+namespace c10::impl {
+// Validates `options` together with an explicitly passed `memory_format` and
+// returns the memory format to use.
+//
+// Fails a TORCH_CHECK when `options` has requires_grad set to true, or when a
+// memory format is given both in `options` and as the explicit argument.
+// Otherwise returns the explicit argument if it has a value, and the (possibly
+// empty) memory format stored in `options` if it does not.
+inline std::optional<MemoryFormat>
+check_tensor_options_and_extract_memory_format(
+    const TensorOptions& options,
+    std::optional<MemoryFormat> memory_format) {
+  TORCH_CHECK(
+      options.requires_grad_opt() != true,
+      "Operators taking TensorOptions cannot take a TensorOptions with "
+      "options.requires_grad set as true. This isn't implemented yet.");
+  TORCH_CHECK(
+      !(options.has_memory_format() && memory_format.has_value()),
+      "Cannot set memory_format both in TensorOptions and explicit argument; please delete "
+      "the redundant setter.");
+  // The explicit argument takes precedence over the value kept in `options`.
+  return memory_format.has_value() ? memory_format
+                                   : options.memory_format_opt();
+}
+} // namespace c10::impl

.venv/lib/python3.12/site-packages/torch/include/ATen/core/DeprecatedTypeProperties.h ADDED Viewed

	@@ -0,0 +1,139 @@

+#pragma once
+#include <c10/core/Backend.h>
+#include <c10/core/ScalarType.h>
+#include <c10/core/Layout.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/core/Storage.h>
+#include <ATen/core/DeprecatedTypePropertiesRegistry.h>
+#include <ATen/core/Generator.h>
+namespace at {
+class Tensor;
+// Pairs a Backend with a ScalarType. It mainly exists as the return value of
+// Tensor::type(): that function used to return Type&, but Type is being
+// changed to no longer be dtype-specific.
+class TORCH_API DeprecatedTypeProperties {
+ public:
+  DeprecatedTypeProperties(Backend backend, ScalarType scalar_type)
+    : backend_(backend), scalar_type_(scalar_type) {}
+  // --- Basic properties derived from the backend / scalar type ---
+  Backend backend() const {
+    return backend_;
+  }
+  Layout layout() const {
+    return layout_from_backend(backend_);
+  }
+  bool is_sparse() const {
+    return layout() == kSparse;
+  }
+  bool is_sparse_csr() const {
+    return layout() == kSparseCsr;
+  }
+  c10::DeviceType device_type() const {
+    return backendToDeviceType(backend_);
+  }
+  bool is_cuda() const {
+    return device_type() == kCUDA;
+  }
+  ScalarType scalarType() const {
+    return scalar_type_;
+  }
+  caffe2::TypeMeta typeMeta() const {
+    return scalarTypeToTypeMeta(scalar_type_);
+  }
+  // Two instances are equal when both backend and scalar type match.
+  bool operator==(const DeprecatedTypeProperties& other) const {
+    return backend_ == other.backend_ && scalar_type_ == other.scalar_type_;
+  }
+  bool operator!=(const DeprecatedTypeProperties& other) const {
+    return !(*this == other);
+  }
+  // Returns "<Backend><ScalarType>Type", or "UndefinedType" when either the
+  // backend or the scalar type is Undefined.
+  std::string toString() const {
+    const bool undefined =
+        backend_ == Backend::Undefined || scalar_type_ == ScalarType::Undefined;
+    if (undefined) {
+      return "UndefinedType";
+    }
+    return std::string(at::toString(backend_)) + at::toString(scalar_type_) + "Type";
+  }
+  // --- Lookups of related instances in the global registry ---
+  DeprecatedTypeProperties & toBackend(Backend b) const {
+    return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
+        b, scalar_type_);
+  }
+  DeprecatedTypeProperties & toScalarType(ScalarType s) const {
+    return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
+        backend_, s);
+  }
+  DeprecatedTypeProperties & cpu() const {
+    return toBackend(Backend::CPU);
+  }
+  DeprecatedTypeProperties & cuda() const {
+    return toBackend(Backend::CUDA);
+  }
+  DeprecatedTypeProperties & hip() const {
+    return toBackend(Backend::HIP);
+  }
+  DeprecatedTypeProperties & privateUser1() const {
+    return toBackend(Backend::PrivateUse1);
+  }
+  /// Constructs the `TensorOptions` from a type and a `device_index`.
+  TensorOptions options(int16_t device_index = -1) const {
+    return TensorOptions().dtype(typeMeta())
+                          .device(device_type(), static_cast<c10::DeviceIndex>(device_index))
+                          .layout(layout());
+  }
+  /// Constructs the `TensorOptions` from a type and a Device.  Asserts that
+  /// the device type matches the device type of the type.
+  TensorOptions options(std::optional<Device> device_opt) const {
+    if (device_opt.has_value()) {
+      const Device device = device_opt.value();
+      AT_ASSERT(device.type() == device_type());
+      return options(device.index());
+    }
+    // No device given: fall back to the default device index.
+    return options(-1);
+  }
+  operator TensorOptions() const {
+    return options();
+  }
+  // Flattens (backend, scalar type) into one integer:
+  // backend * ScalarType::NumOptions + scalar type.
+  int64_t id() const {
+    const auto backend_id = static_cast<int64_t>(backend());
+    const auto scalar_id = static_cast<int64_t>(scalarType());
+    return backend_id * static_cast<int64_t>(ScalarType::NumOptions) + scalar_id;
+  }
+  Tensor unsafeTensorFromTH(void * th_pointer, bool retain) const;
+  Storage unsafeStorageFromTH(void * th_pointer, bool retain) const;
+  Tensor copy(const Tensor & src, bool non_blocking=false, std::optional<Device> to_device={}) const;
+ private:
+  Backend backend_;
+  ScalarType scalar_type_;
+};
+}  // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/DeprecatedTypePropertiesRegistry.h ADDED Viewed

	@@ -0,0 +1,33 @@

+#pragma once
+// In order to preserve bc, we make DeprecatedTypeProperties instances unique
+// just like they are for Type.
+#include <c10/core/Backend.h>
+#include <c10/core/ScalarType.h>
+#include <memory>
+namespace at {
+class DeprecatedTypeProperties;
+// Deleter functor for DeprecatedTypeProperties pointers; defined out of line.
+// NOTE(review): the registry array below is declared with the default
+// unique_ptr deleter, so this functor is not referenced in this header --
+// confirm where it is used.
+struct TORCH_API DeprecatedTypePropertiesDeleter {
+  void operator()(DeprecatedTypeProperties * ptr);
+};
+// Owns one DeprecatedTypeProperties slot per (Backend, ScalarType)
+// combination and hands out references to them.
+class TORCH_API DeprecatedTypePropertiesRegistry {
+ public:
+  DeprecatedTypePropertiesRegistry();
+  // Returns the instance for backend `p` and scalar type `s`.
+  DeprecatedTypeProperties& getDeprecatedTypeProperties(Backend p, ScalarType s) const;
+private:
+  // Two-dimensional table indexed by [Backend][ScalarType]; its extents are
+  // Backend::NumOptions and ScalarType::NumOptions.
+  // NOLINTNEXTLINE(*c-array*)
+  std::unique_ptr<DeprecatedTypeProperties> registry
+    [static_cast<int>(Backend::NumOptions)]
+    [static_cast<int>(ScalarType::NumOptions)];
+};
+// Accessor for the global registry object (defined out of line).
+TORCH_API DeprecatedTypePropertiesRegistry& globalDeprecatedTypePropertiesRegistry();
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Dict.h ADDED Viewed

	@@ -0,0 +1,396 @@

+#pragma once
+#include <c10/macros/Macros.h>
+#include <c10/macros/Export.h>
+#include <c10/util/TypeTraits.h>
+#include <c10/util/TypeList.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/order_preserving_flat_hash_map.h>
+#include <optional>
+#include <ATen/core/TensorBody.h>
+#include <ATen/core/jit_type_base.h>
+namespace c10 {
+struct IValue;
+template<class Key, class Value> class Dict;
+struct Type;
+namespace impl {
+// Compile-time list of the concrete types accepted as the Key of a typed
+// Dict<Key, Value>; Dict checks its Key against this list in a static_assert.
+using valid_dict_key_types = guts::typelist::typelist<
+  int64_t,
+  std::string,
+  double,
+  c10::complex<double>,
+  bool,
+  at::Tensor
+>;
+}
+namespace detail {
+// Hash functor for IValue keys of the underlying map (defined in Dict_inl.h).
+struct DictKeyHash {
+  size_t operator()(const IValue& ivalue) const;
+};
+// Equality functor for IValue keys of the underlying map (defined in
+// Dict_inl.h).
+struct DictKeyEqualTo {
+  bool operator()(const IValue& lhs, const IValue& rhs) const;
+};
+// Reference-counted storage behind a Dict: the order-preserving hash map from
+// IValue to IValue plus the key and value element types.
+struct DictImpl final : public c10::intrusive_ptr_target {
+  using dict_map_type = ska_ordered::order_preserving_flat_hash_map<IValue, IValue, DictKeyHash, DictKeyEqualTo>;
+  // Type tags of the keys and values stored in the map.
+  struct DictElementTypes final {
+    TypePtr keyType;
+    TypePtr valueType;
+  };
+  // Takes both arguments by value and moves them into the members.
+  explicit DictImpl(dict_map_type dict_, DictElementTypes elementTypes_)
+  : dict(std::move(dict_))
+  , elementTypes(std::move(elementTypes_)) {}
+  dict_map_type dict;
+  DictElementTypes elementTypes;
+  // Returns a new DictImpl built from copies of `dict` and `elementTypes`.
+  intrusive_ptr<DictImpl> copy() const;
+  friend TORCH_API bool operator==(const DictImpl& lhs, const DictImpl& rhs);
+};
+}
+namespace impl {
+template<class Key, class Value, class Iterator> class DictIterator;
+/**
+ * A reference to an entry in the Dict.
+ * Use the `key()` and `value()` methods to read the element.
+ */
+template<class Key, class Value, class Iterator>
+class DictEntryRef final {
+public:
+  explicit DictEntryRef(Iterator iterator)
+  : iterator_(std::move(iterator)) {}
+  // Returns the entry's key converted with `to<Key>()`.
+  decltype(auto) key() const {
+    return iterator_->first.template to<Key>();
+  }
+  // Returns the entry's value converted with `to<Value>()`.
+  decltype(auto) value() const {
+    return iterator_->second.template to<Value>();
+  }
+  // Overwrites the entry's value with a `Value` constructed from `value`.
+  // Fails to compile when `Value` is not constructible from `Value_`.
+  template<class Value_>
+  void setValue(Value_&& value) const {
+    static_assert(std::is_constructible_v<Value, Value_>, "Wrong type for the value argument of setValue()");
+    iterator_->second = Value(std::forward<Value_>(value));
+  }
+  ~DictEntryRef() = default;
+private:
+  // allow copying and moving, but only our friends (i.e. the Dict class) can do
+  // it. Copying/moving this reference wrapper would be too ambiguous to allow it
+  // in the public API.
+  DictEntryRef(const DictEntryRef&) = default;
+  DictEntryRef& operator=(const DictEntryRef&) = default;
+  DictEntryRef(DictEntryRef&&) noexcept = default;
+  DictEntryRef& operator=(DictEntryRef&& rhs) & noexcept = default;
+  // Iterator into the underlying map that designates the referenced entry.
+  Iterator iterator_;
+  friend class DictIterator<Key, Value, Iterator>;
+  friend class Dict<Key, Value>;
+};
+// this wraps map_type::iterator to make sure user code can't rely
+// on it being the type of the underlying map.
+// Dereferencing yields a DictEntryRef; the wrapped iterator is stored inside
+// that entry reference.
+template<class Key, class Value, class Iterator>
+class DictIterator final {
+public:
+   // C++17 friendly std::iterator implementation
+  using iterator_category = std::forward_iterator_tag;
+  using value_type = DictEntryRef<Key, Value, Iterator>;
+  using difference_type = std::ptrdiff_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  explicit DictIterator() = default;
+  ~DictIterator() = default;
+  // Copy/move operations copy or move the contained entry reference.
+  DictIterator(const DictIterator& rhs): entryRef_(rhs.entryRef_) {}
+  DictIterator(DictIterator&& rhs) noexcept: entryRef_(std::move(rhs.entryRef_)) {}
+  DictIterator& operator=(const DictIterator& rhs) = default;
+  DictIterator& operator=(DictIterator&& rhs) noexcept {
+    entryRef_ = std::move(rhs.entryRef_);
+    return *this;
+  }
+  // Pre-increment: advances the wrapped iterator.
+  DictIterator& operator++() {
+      ++entryRef_.iterator_;
+      return *this;
+  }
+  // Post-increment: returns a copy taken before advancing.
+  DictIterator operator++(int) {
+      DictIterator copy(*this);
+      ++*this;
+      return copy;
+  }
+  // Access to the entry reference for the current position.
+  const DictEntryRef<Key, Value, Iterator>& operator*() const {
+      return entryRef_;
+  }
+  const DictEntryRef<Key, Value, Iterator>* operator->() const {
+    return &entryRef_;
+  }
+  // Difference of the two wrapped iterators.
+  friend difference_type operator-(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.entryRef_.iterator_ - rhs.entryRef_.iterator_;
+  }
+private:
+  // Only friends (e.g. Dict) can build an iterator from a map iterator.
+  explicit DictIterator(Iterator iterator): entryRef_(std::move(iterator)) {}
+  const Iterator& get_iterator_() const {
+    return entryRef_.iterator_;
+  }
+  // Comparison operators; each compares the wrapped iterators.
+  friend bool operator==(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() == rhs.get_iterator_();
+  }
+  friend bool operator!=(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() != rhs.get_iterator_();
+  }
+  friend bool operator<(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() < rhs.get_iterator_();
+  }
+  friend bool operator<=(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() <= rhs.get_iterator_();
+  }
+  friend bool operator>(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() > rhs.get_iterator_();
+  }
+  friend bool operator>=(const DictIterator& lhs, const DictIterator& rhs) {
+    return lhs.get_iterator_() >= rhs.get_iterator_();
+  }
+  DictEntryRef<Key, Value, Iterator> entryRef_;
+  friend class DictIterator<Key, Value, typename c10::detail::DictImpl::dict_map_type::iterator>;
+  friend class Dict<Key, Value>;
+};
+// Conversions between the generic Dict<IValue, IValue> and a typed
+// Dict<Key, Value>; both are defined in Dict_inl.h.
+template<class Key, class Value> Dict<Key, Value> toTypedDict(Dict<IValue, IValue> dict);
+template<class Key, class Value> Dict<IValue, IValue> toGenericDict(Dict<Key, Value> dict);
+}
+/**
+ * An object of this class stores a map from Key to Value.
+ *
+ * This is a pointer type. After a copy, both Dicts
+ * will share the same storage:
+ *
+ * > Dict<int, string> a;
+ * > Dict<int, string> b = a;
+ * > b.insert(3, "three");
+ * > ASSERT("three" == a.at(3));
+ *
+ * We use this class in the PyTorch kernel API because that
+ * allows us to do optimizations and switch out the underlying
+ * map implementation without breaking backwards compatibility
+ * for the kernel API.
+ */
+template<class Key, class Value>
+// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
+class Dict final {
+private:
+  // The message lists every type in impl::valid_dict_key_types so that the
+  // diagnostic matches what the check actually accepts.
+  static_assert((std::is_same_v<IValue, Key> && std::is_same_v<IValue, Value>) || guts::typelist::contains<impl::valid_dict_key_types, Key>::value, "Invalid Key type for Dict. We only support int64_t, std::string, double, c10::complex<double>, bool, and at::Tensor.");
+  // impl_ stores the underlying map as a ska_ordered::order_preserving_flat_hash_map.
+  // We intentionally don't offer conversion from/to
+  // order_preserving_flat_hash_map, return references to it or something like that,
+  // because such operations would get expensive if we switch out
+  // the actual map implementation.
+  // This is an intrusive_ptr because Dict is a pointer type.
+  // Invariant: This will never be a nullptr, there will always be a valid
+  // DictImpl.
+  c10::intrusive_ptr<detail::DictImpl> impl_;
+  explicit Dict(c10::intrusive_ptr<detail::DictImpl>&& impl);
+  friend struct IValue;
+  template<class K, class V> friend Dict<K, V> impl::toTypedDict(Dict<IValue, IValue>);
+  template<class K, class V> friend Dict<IValue, IValue> impl::toGenericDict(Dict<K, V>);
+public:
+  using key_type = Key;
+  using mapped_type = Value;
+  using size_type = typename detail::DictImpl::dict_map_type::size_type;
+  using iterator = impl::DictIterator<Key, Value, typename detail::DictImpl::dict_map_type::iterator>;
+  /**
+   * Creates an empty dict.
+   */
+  explicit Dict();
+  /**
+   * Create a generic dict with runtime type information.
+   * This only works for c10::impl::GenericDict and is not part of the public API
+   * but only supposed to be used internally by PyTorch.
+   */
+  explicit Dict(TypePtr keyType, TypePtr valueType);
+  ~Dict() = default;
+  Dict(const Dict&) = default;
+  Dict& operator=(const Dict&) = default;
+  /**
+   * Create a new Dict pointing to a deep copy of the same data.
+   * The Dict returned is a new dict with separate storage.
+   * Changes in it are not reflected in the original dict or vice versa.
+   */
+  Dict copy() const;
+  /**
+   * Returns an iterator to the first element of the container.
+   * If the container is empty, the returned iterator will be equal to end().
+   */
+  iterator begin() const;
+  /**
+   * Returns an iterator to the element following the last element of the container.
+   * This element acts as a placeholder; attempting to access it results in undefined behavior.
+   */
+  iterator end() const;
+  /**
+   * Checks if the container has no elements.
+   */
+  bool empty() const;
+  /**
+   * Returns the number of elements in the container.
+   */
+  size_type size() const;
+  /**
+   * Erases all elements from the container. After this call, size() returns zero.
+   * Invalidates any references, pointers, or iterators referring to contained elements. May also invalidate past-the-end iterators.
+   */
+  void clear() const;
+  /**
+   * Inserts element(s) into the container, if the container doesn't already contain an element with an equivalent key.
+   * May invalidate any references, pointers, or iterators referring to contained elements.
+   *
+   * @return A pair consisting of an iterator to the inserted element (or to the element that prevented the insertion) and a bool denoting whether the insertion took place.
+   */
+  template<class Key_, class Value_>
+  std::pair<iterator, bool> insert(Key_&& key, Value_&& value) const;
+  /**
+   * If an element with the given key already exists, it is overwritten with the given value.
+   * Otherwise, a new element with the given key and value are inserted.
+   * May invalidate any references, pointers, or iterators referring to contained elements.
+   *
+   * @return The bool component is true if the insertion took place and false if the assignment took place. The iterator component is pointing at the element that was inserted or updated.
+   */
+  template<class Key_, class Value_>
+  std::pair<iterator, bool> insert_or_assign(Key_&& key, Value_&& value) const;
+  /**
+   * Removes the element pointed to by iter.
+   * May invalidate any references, pointers, or iterators referring to contained elements.
+   * The iterator iter must be valid and dereferenceable. Thus the end() iterator (which is valid, but is not dereferenceable) cannot be used as a value for iter.
+   */
+  void erase(iterator iter) const;
+  /**
+   * Removes the element with the given key, if it exists.
+   * May invalidate any references, pointers, or iterators referring to contained elements.
+   *
+   * @return The number of elements removed. This is either '1' if an element with the key existed, or '0' if it didn't.
+   */
+  [[nodiscard]] size_t erase(const Key& key) const;
+  /**
+   * Returns the mapped value of the element with key equivalent to key.
+   * If no such element exists, an exception of type std::out_of_range is thrown.
+   */
+  Value at(const Key& key) const;
+  /**
+   * Finds an element with key equivalent to key.
+   *
+   * @return Iterator to an element with key equivalent to key.
+   *         If no such element is found, past-the-end (see end()) iterator is returned.
+   */
+  iterator find(const Key& key) const;
+  /**
+   * Checks if there is an element with key equivalent to key in the container.
+   *
+   * @return true if there is such an element, otherwise false.
+   */
+  bool contains(const Key& key) const;
+  /**
+   * Increase the capacity so that at least count elements can be stored without
+   * having to reallocate or rehash.
+   */
+  void reserve(size_type count) const;
+  /**
+   * Value equality comparison. This function implements Python-like semantics for
+   * equality: two dicts with the same identity (e.g. same pointer) trivially
+   * compare equal, otherwise each element is compared for equality.
+   */
+  template <class Key_, class Value_>
+  friend bool operator==(
+      const Dict<Key_, Value_>& lhs,
+      const Dict<Key_, Value_>& rhs);
+  template <class Key_, class Value_>
+  friend bool operator!=(
+      const Dict<Key_, Value_>& lhs,
+      const Dict<Key_, Value_>& rhs);
+  /**
+   * Identity comparison. Returns true if and only if `rhs` represents the same
+   * Dict object as `this`.
+   */
+  bool is(const Dict& rhs) const;
+  // private API for now because the return type will change to TypePtr
+  // instead of std::optional<TypePtr> once types are mandatory.
+  TypePtr keyType() const;
+  TypePtr valueType() const;
+  // [unsafe set type]
+  // These functions mutate the tagged type of this dictionary in place.
+  // There is no checking that the members of the dictionary are instances
+  // of the new types, nor is there a check that other IValues which
+  // hold references to this dictionary have the right static type.
+  // This functionality is used only in the unpickler, where at
+  // creation type the real type of the dictionary is unknown, but
+  // then later recovered from the static type information of the
+  // unpickled object.
+  void unsafeSetKeyType(TypePtr t);
+  void unsafeSetValueType(TypePtr t);
+};
+namespace impl {
+// GenericDict is how IValue stores dicts. It is, however, not part of the
+// public API. Kernels should use Dicts with concrete Key, Value types instead
+// (maybe except for some internal prim ops).
+using GenericDict = Dict<IValue, IValue>;
+}
+}
+// Exposes c10::Dict under the name torch::Dict.
+namespace torch {
+  template<class Key, class Value> using Dict = c10::Dict<Key, Value>;
+}
+#include <ATen/core/Dict_inl.h>  // IWYU pragma: keep

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Dict_inl.h ADDED Viewed

	@@ -0,0 +1,208 @@

+#pragma once
+#include <ATen/core/ivalue.h>
+#include <c10/util/hash.h>
+namespace c10 {
+namespace detail {
+// Key equality used by the dict's underlying map.
+inline bool DictKeyEqualTo::operator()(const IValue& lhs, const IValue& rhs) const {
+  const bool both_tensors = lhs.isTensor() && rhs.isTensor();
+  // Tensors are compared only by identity (following how it's done in
+  // Python). Everything else is compared by identity first for efficiency and
+  // then by value (see: [container equality]).
+  return both_tensors ? lhs.is(rhs) : _fastEqualsForContainer(lhs, rhs);
+}
+}
+template<class T> decltype(auto) getTypePtr();
+std::string toString(const Type& type);
+namespace impl {
+// Reinterprets a generic dict as Dict<Key, Value>, sharing the same storage
+// (the impl_ pointer is moved, not copied). Asserts that the dict's stored key
+// and value types equal the types obtained from getTypePtr<Key>() and
+// getTypePtr<Value>().
+template<class Key, class Value>
+Dict<Key, Value> toTypedDict(GenericDict dict) {
+  TORCH_INTERNAL_ASSERT(*getTypePtr<Key>() == *dict.impl_->elementTypes.keyType, "Tried to cast a Dict<", toString(*dict.impl_->elementTypes.keyType), ", ", toString(*dict.impl_->elementTypes.valueType) ,"> to a Dict<", toString(*getTypePtr<Key>()), ", ", toString(*getTypePtr<Value>()), ">. Key types mismatch.");
+  TORCH_INTERNAL_ASSERT(*getTypePtr<Value>() == *dict.impl_->elementTypes.valueType, "Tried to cast a Dict<", toString(*dict.impl_->elementTypes.keyType), ", ", toString(*dict.impl_->elementTypes.valueType) ,"> to a Dict<", toString(*getTypePtr<Key>()), ", ", toString(*getTypePtr<Value>()), ">. Value types mismatch.");
+  return Dict<Key, Value>(std::move(dict.impl_));
+}
+// Converts a typed dict into a GenericDict that shares the same storage (the
+// impl_ pointer is moved, not copied). No type check is performed here.
+template<class Key, class Value>
+GenericDict toGenericDict(Dict<Key, Value> dict) {
+  return GenericDict(std::move(dict.impl_));
+}
+}
+namespace detail {
+// Hashes an IValue used as a dict key by dispatching on its tag. Int, string,
+// double, complex double, bool, tensor and device are supported; a tensor is
+// hashed by the address of its TensorImpl. Any other tag fails a TORCH_CHECK.
+inline size_t DictKeyHash::operator()(const IValue& ivalue) const {
+  if (ivalue.isInt()) {
+    return std::hash<int64_t>()(ivalue.toInt());
+  }
+  if (ivalue.isString()) {
+    return std::hash<std::string_view>()(ivalue.toStringView());
+  }
+  if (ivalue.isDouble()) {
+    return std::hash<double>()(ivalue.toDouble());
+  }
+  if (ivalue.isComplexDouble()) {
+    return c10::hash<c10::complex<double>>()(ivalue.toComplexDouble());
+  }
+  if (ivalue.isBool()) {
+    return std::hash<bool>()(ivalue.toBool());
+  }
+  if (ivalue.isTensor()) {
+    return std::hash<TensorImpl*>()(ivalue.toTensor().unsafeGetTensorImpl());
+  }
+  if (ivalue.isDevice()) {
+    return std::hash<Device>()(ivalue.toDevice());
+  }
+  TORCH_CHECK(false, "Can't hash IValues with tag '", ivalue.tagKind(), "'");
+}
+// Creates a new DictImpl from copies of this object's map and element types.
+inline intrusive_ptr<DictImpl> DictImpl::copy() const {
+  return make_intrusive<DictImpl>(dict, elementTypes);
+}
+}
+// Default constructor: creates an empty map whose element types come from
+// getTypePtr<Key>() and getTypePtr<Value>(). Rejected at compile time when Key
+// or Value is IValue.
+template<class Key, class Value>
+Dict<Key, Value>::Dict()
+  :Dict(make_intrusive<detail::DictImpl>(
+      detail::DictImpl::dict_map_type(),
+      detail::DictImpl::DictElementTypes{getTypePtr<Key>(), getTypePtr<Value>()})) {
+  static_assert(!std::is_same_v<Key, IValue>, "This constructor is not valid for Dict<IValue, _>. Please use c10::impl::GenericDict(keyType, valueType) instead.");
+  static_assert(!std::is_same_v<Value, IValue>, "This constructor is not valid for Dict<_, IValue>. Please use c10::impl::GenericDict(keyType, valueType) instead.");
+}
+// Constructor with runtime type information: creates an empty map tagged with
+// the given key and value types. Only compiles when both Key and Value are
+// IValue, i.e. for c10::impl::GenericDict.
+template<class Key, class Value>
+Dict<Key, Value>::Dict(TypePtr keyType, TypePtr valueType)
+: Dict(make_intrusive<detail::DictImpl>(
+    detail::DictImpl::dict_map_type(),
+    detail::DictImpl::DictElementTypes {std::move(keyType), std::move(valueType)})) {
+  static_assert(std::is_same_v<Key, IValue>, "This constructor is only valid for c10::impl::GenericDict.");
+  static_assert(std::is_same_v<Value, IValue>, "This constructor is only valid for c10::impl::GenericDict.");
+}
+// Private constructor: adopts an existing DictImpl as this Dict's storage.
+template<class Key, class Value>
+Dict<Key, Value>::Dict(c10::intrusive_ptr<detail::DictImpl>&& impl): impl_(std::move(impl)) {}
+// Returns a Dict backed by a new DictImpl obtained from impl_->copy(), so the
+// result does not share storage with this Dict.
+template<class Key, class Value>
+Dict<Key, Value> Dict<Key, Value>::copy() const {
+  return Dict<Key, Value>(impl_->copy());
+}
+// Iterator wrapping the first position of the underlying map.
+template <class Key, class Value>
+typename Dict<Key, Value>::iterator Dict<Key, Value>::begin() const {
+  auto first = impl_->dict.begin();
+  return iterator{std::move(first)};
+}
+// Iterator wrapping the past-the-end position of the underlying map.
+template <class Key, class Value>
+typename Dict<Key, Value>::iterator Dict<Key, Value>::end() const {
+  auto last = impl_->dict.end();
+  return iterator{std::move(last)};
+}
+// True when the underlying map holds no entries.
+template <class Key, class Value>
+bool Dict<Key, Value>::empty() const {
+  auto& entries = impl_->dict;
+  return entries.empty();
+}
+// Number of entries in the underlying map.
+template <class Key, class Value>
+typename Dict<Key, Value>::size_type Dict<Key, Value>::size() const {
+  auto& entries = impl_->dict;
+  return entries.size();
+}
+// Removes every entry from the underlying map. Declared const because the
+// Dict object only holds a pointer to the implementation it mutates.
+template <class Key, class Value>
+void Dict<Key, Value>::clear() const {
+  auto& entries = impl_->dict;
+  entries.clear();
+}
+// Emplaces (key, value) into the underlying map and returns the resulting
+// position together with the flag reported by the map's emplace().
+template <class Key, class Value>
+template <class Key_, class Value_>
+std::pair<typename Dict<Key, Value>::iterator, bool> Dict<Key, Value>::insert(Key_&& key, Value_&& value) const {
+  static_assert(std::is_constructible_v<Key, Key_>, "Wrong type for the key argument of Dict::insert");
+  static_assert(std::is_constructible_v<Value, Value_>, "Wrong type for the value argument of Dict::insert");
+  // The arguments are first converted to the static Key/Value types.
+  auto [position, was_inserted] = impl_->dict.emplace(
+      Key(std::forward<Key_>(key)),
+      Value(std::forward<Value_>(value)));
+  return {iterator{position}, was_inserted};
+}
+// Forwards to the underlying map's insert_or_assign() and returns the
+// resulting position together with the flag that call reports.
+template <class Key, class Value>
+template <class Key_, class Value_>
+std::pair<typename Dict<Key, Value>::iterator, bool> Dict<Key, Value>::insert_or_assign(Key_&& key, Value_&& value) const {
+  static_assert(std::is_constructible_v<Key, Key_>, "Wrong type for the key argument of Dict::insert_or_assign");
+  static_assert(std::is_constructible_v<Value, Value_>, "Wrong type for the value argument of Dict::insert_or_assign");
+  // The arguments are first converted to the static Key/Value types.
+  auto [position, was_inserted] = impl_->dict.insert_or_assign(
+      Key(std::forward<Key_>(key)),
+      Value(std::forward<Value_>(value)));
+  return {iterator{position}, was_inserted};
+}
+// Erases the entry that `iter` refers to.
+template <class Key, class Value>
+void Dict<Key, Value>::erase(iterator iter) const {
+  auto& entries = impl_->dict;
+  entries.erase(iter.entryRef_.iterator_);
+}
+// Erases by key and returns whatever count the underlying map's erase() reports.
+template <class Key, class Value>
+[[nodiscard]] size_t Dict<Key, Value>::erase(const Key& key) const {
+  auto& entries = impl_->dict;
+  return entries.erase(key);
+}
+// Looks `key` up with the underlying map's at() and converts the stored
+// value to the static Value type.
+template <class Key, class Value>
+Value Dict<Key, Value>::at(const Key& key) const {
+  auto& stored = impl_->dict.at(key);
+  return stored.template to<Value>();
+}
+// Iterator wrapping the result of the underlying map's find().
+template <class Key, class Value>
+typename Dict<Key, Value>::iterator Dict<Key, Value>::find(const Key& key) const {
+  auto position = impl_->dict.find(key);
+  return iterator{std::move(position)};
+}
+// True when find(key) does not land on end().
+template <class Key, class Value>
+bool Dict<Key, Value>::contains(const Key& key) const {
+  const iterator past_the_end = end();
+  const iterator hit = find(key);
+  return past_the_end != hit;
+}
+// Forwards the requested capacity to the underlying map's reserve().
+template <class Key, class Value>
+void Dict<Key, Value>::reserve(size_type count) const {
+  auto& entries = impl_->dict;
+  entries.reserve(count);
+}
+// Key type recorded in the implementation object.
+template <class Key, class Value>
+TypePtr Dict<Key, Value>::keyType() const {
+  const auto& types = impl_->elementTypes;
+  return types.keyType;
+}
+// Value type recorded in the implementation object.
+template <class Key, class Value>
+TypePtr Dict<Key, Value>::valueType() const {
+  const auto& types = impl_->elementTypes;
+  return types.valueType;
+}
+// Overwrites the recorded key type; nothing here checks the existing entries
+// against the new type.
+template <class Key, class Value>
+void Dict<Key, Value>::unsafeSetKeyType(TypePtr t) {
+  auto& types = impl_->elementTypes;
+  types.keyType = std::move(t);
+}
+// Overwrites the recorded value type; nothing here checks the existing entries
+// against the new type.
+template <class Key, class Value>
+void Dict<Key, Value>::unsafeSetValueType(TypePtr t) {
+  auto& types = impl_->elementTypes;
+  types.valueType = std::move(t);
+}
+// Equality: two handles to the same implementation object are equal without
+// further work; otherwise the implementation objects themselves are compared.
+template <class Key_, class Value_>
+bool operator==(const Dict<Key_, Value_>& lhs, const Dict<Key_, Value_>& rhs) {
+  return lhs.impl_ == rhs.impl_ || *lhs.impl_ == *rhs.impl_;
+}
+// Inequality is defined as the negation of operator==.
+template <class Key_, class Value_>
+bool operator!=(const Dict<Key_, Value_>& lhs, const Dict<Key_, Value_>& rhs) {
+  const bool same = (lhs == rhs);
+  return !same;
+}
+// Identity check: true when both Dicts point at the same implementation object.
+template <class Key, class Value>
+bool Dict<Key, Value>::is(const Dict& rhs) const {
+  return impl_ == rhs.impl_;
+}
+}

.venv/lib/python3.12/site-packages/torch/include/ATen/core/DimVector.h ADDED Viewed

	@@ -0,0 +1,13 @@

+#pragma once
+#include <c10/util/DimVector.h>
+namespace at {
+// Re-declaring 'DimVector' type and size inside 'at' namespace.
+// This is done to avoid modifying every use into their 'c10'
+// equivalent.
+using c10::kDimVectorStaticSize;
+using c10::DimVector;
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Dimname.h ADDED Viewed

	@@ -0,0 +1,48 @@

+#pragma once
+#include <ATen/core/symbol.h>
+#include <c10/util/ArrayRef.h>
+#include <optional>
+#include <ostream>
+namespace at {
+// Kind of a dimension name: a basic name or the wildcard.
+enum class NameType : uint8_t { BASIC, WILDCARD };
+// A dimension name: a Symbol paired with its NameType. Instances are obtained
+// through the static factories; the constructors are private.
+struct TORCH_API Dimname {
+  static Dimname fromSymbol(Symbol name);
+  static Dimname wildcard();
+  static bool isValidName(const std::string& name);
+  // Plain accessors.
+  NameType type() const {
+    return type_;
+  }
+  Symbol symbol() const {
+    return name_;
+  }
+  // Kind predicates.
+  bool isBasic() const {
+    return NameType::BASIC == type_;
+  }
+  bool isWildcard() const {
+    return NameType::WILDCARD == type_;
+  }
+  bool matches(Dimname other) const;
+  std::optional<Dimname> unify(Dimname other) const;
+ private:
+  // A name constructed without an explicit kind is BASIC.
+  Dimname(Symbol name)
+      : name_(name), type_(NameType::BASIC) {}
+  Dimname(Symbol name, NameType type)
+      : name_(name), type_(type) {}
+  Symbol name_;
+  NameType type_;
+};
+using DimnameList = c10::ArrayRef<Dimname>;
+TORCH_API std::ostream& operator<<(std::ostream& out, const Dimname& dimname);
+// Two Dimnames compare equal when their symbols are equal; the NameType is
+// not consulted.
+inline bool operator==(const Dimname& lhs, const Dimname& rhs) {
+  const Symbol left = lhs.symbol();
+  const Symbol right = rhs.symbol();
+  return left == right;
+}
+inline bool operator!=(const Dimname& lhs, const Dimname& rhs) {
+  const bool same = (lhs == rhs);
+  return !same;
+}
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/DistributionsHelper.h ADDED Viewed

	@@ -0,0 +1,332 @@

+#pragma once
+#include <ATen/core/TransformationHelper.h>
+#include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/MathConstants.h>
+#include <c10/macros/Macros.h>
+#include <cmath>
+#include <limits>
+#include <optional>
+#include <type_traits>
+/**
+ * Distributions kernel adapted from THRandom.cpp
+ * The kernels try to follow std::random distributions signature
+ * For instance: in ATen
+ *      auto gen = at::detail::createCPUGenerator();
+ *      at::uniform_real_distribution<double> uniform(0, 1);
+ *      auto sample = uniform(gen.get());
+ *
+ *      vs std::random
+ *
+ *      std::mt19937 gen;
+ *      std::uniform_real_distribution<double> uniform(0, 1);
+ *      auto sample = uniform(gen);
+ */
+namespace at {
+namespace {
+/**
+ * Draws from a discrete uniform distribution over [base, base + range),
+ * returning the sample as type T.
+ */
+template <typename T>
+struct uniform_int_from_to_distribution {
+  C10_HOST_DEVICE inline uniform_int_from_to_distribution(uint64_t range, int64_t base)
+      : range_(range), base_(base) {}
+  template <typename RNG>
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    // Pick between generator->random64() and generator->random() based on
+    // how large the requested range is.
+#ifdef FBCODE_CAFFE2
+    const bool use_random64 =
+        (std::is_same_v<T, int64_t> ||
+         std::is_same_v<T, double> ||
+         std::is_same_v<T, float> ||
+         std::is_same_v<T, at::BFloat16>) &&
+        (range_ >= (1ULL << 32));
+#else
+    // allow approx 5% skew in uniform int generation using %
+    const bool use_random64 = range_ >= (1ULL << 28);
+#endif
+    if (use_random64) {
+      return transformation::uniform_int_from_to<T>(generator->random64(), range_, base_);
+    }
+    return transformation::uniform_int_from_to<T>(generator->random(), range_, base_);
+  }
+ private:
+  uint64_t range_;
+  int64_t base_;
+};
+/**
+ * Draws from a discrete uniform distribution over
+ * [min_value(int64_t), max_value(int64_t)].
+ */
+template <typename T>
+struct uniform_int_full_range_distribution {
+  template <typename RNG>
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    // One random64() draw is handed to the transformation helper.
+    return transformation::uniform_int_full_range<T>(generator->random64());
+  }
+};
+/**
+ * Draws from a discrete uniform distribution over [0, max_value(T)] for
+ * integral types and [0, 2^mantissa] for floating-point types.
+ */
+template <typename T>
+struct uniform_int_distribution {
+  template <typename RNG>
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    // double and int64_t consume random64(); every other T consumes random().
+    constexpr bool use_random64 =
+        std::is_same_v<T, double> || std::is_same_v<T, int64_t>;
+    if constexpr (use_random64) {
+      return transformation::uniform_int<T>(generator->random64());
+    } else {
+      return transformation::uniform_int<T>(generator->random());
+    }
+  }
+};
+/**
+ * Draws from a uniform distribution over [from, to), with bounds of type T.
+ */
+template <typename T>
+struct uniform_real_distribution {
+  C10_HOST_DEVICE inline uniform_real_distribution(T from, T to)
+      : from_(from), to_(to) {
+    // Bounds must be ordered and their difference must stay representable.
+    TORCH_CHECK_IF_NOT_ON_CUDA(from <= to);
+    TORCH_CHECK_IF_NOT_ON_CUDA(to - from <= std::numeric_limits<T>::max());
+  }
+  template <typename RNG>
+  C10_HOST_DEVICE inline dist_acctype<T> operator()(RNG generator) {
+    // double consumes random64(); every other T consumes random().
+    constexpr bool use_random64 = std::is_same_v<T, double>;
+    if constexpr (use_random64) {
+      return transformation::uniform_real<T>(generator->random64(), from_, to_);
+    } else {
+      return transformation::uniform_real<T>(generator->random(), from_, to_);
+    }
+  }
+ private:
+  T from_;
+  T to_;
+};
+// The SFINAE checks introduced in #39816 look overcomplicated and must be revisited
+// https://github.com/pytorch/pytorch/issues/40052
+// Generates a trait `has_member_<member><T>` whose `value` is true when
+// `&T::member` is a well-formed expression. Detection works by overload
+// resolution: the pointer-to-member overload is viable only in that case,
+// otherwise the variadic fallback is chosen.
+#define DISTRIBUTION_HELPER_GENERATE_HAS_MEMBER(member)                  \
+template <typename T>                                                    \
+struct has_member_##member {                                             \
+  using yes = char;                                                      \
+  using no = long;                                                       \
+  template <typename U> static yes test(decltype(&U::member));           \
+  template <typename U> static no test(...);                             \
+  static constexpr bool value = (sizeof(yes) == sizeof(test<T>(0)));     \
+}
+DISTRIBUTION_HELPER_GENERATE_HAS_MEMBER(next_double_normal_sample);
+DISTRIBUTION_HELPER_GENERATE_HAS_MEMBER(set_next_double_normal_sample);
+DISTRIBUTION_HELPER_GENERATE_HAS_MEMBER(next_float_normal_sample);
+DISTRIBUTION_HELPER_GENERATE_HAS_MEMBER(set_next_float_normal_sample);
+// Generates, for TYPE in {double, float}, two pairs of overloads selected by
+// the has_member_* traits:
+//  - maybe_get_next_TYPE_normal_sample: when the generator exposes both the
+//    getter and the setter, pops the cached sample into *ret (resetting the
+//    cache to an empty optional) and returns true; otherwise returns false.
+//  - maybe_set_next_TYPE_normal_sample: when the generator exposes the setter,
+//    stores `cache` through it; otherwise does nothing.
+#define DISTRIBUTION_HELPER_GENERATE_NEXT_NORMAL_METHODS(TYPE)                                      \
+                                                                                                    \
+template <typename RNG, typename ret_type,                                                          \
+          std::enable_if_t<(                                                                        \
+            has_member_next_##TYPE##_normal_sample<RNG>::value &&                                   \
+            has_member_set_next_##TYPE##_normal_sample<RNG>::value                                  \
+          ), int> = 0>                                                                              \
+C10_HOST_DEVICE inline bool maybe_get_next_##TYPE##_normal_sample(RNG* generator, ret_type* ret) {  \
+  if (!generator->next_##TYPE##_normal_sample()) {                                                  \
+    return false;                                                                                   \
+  }                                                                                                 \
+  *ret = *(generator->next_##TYPE##_normal_sample());                                               \
+  generator->set_next_##TYPE##_normal_sample(std::optional<TYPE>());                                \
+  return true;                                                                                      \
+}                                                                                                   \
+                                                                                                    \
+template <typename RNG, typename ret_type,                                                          \
+          std::enable_if_t<!(                                                                       \
+            has_member_next_##TYPE##_normal_sample<RNG>::value &&                                   \
+            has_member_set_next_##TYPE##_normal_sample<RNG>::value                                  \
+          ), int> = 0>                                                                              \
+C10_HOST_DEVICE inline bool maybe_get_next_##TYPE##_normal_sample(RNG* /*generator*/, ret_type* /*ret*/) {  \
+  return false;                                                                                     \
+}                                                                                                   \
+                                                                                                    \
+template <typename RNG, typename ret_type,                                                          \
+          std::enable_if_t<                                                                         \
+            has_member_set_next_##TYPE##_normal_sample<RNG>::value,                                 \
+            int> = 0>                                                                               \
+C10_HOST_DEVICE inline void maybe_set_next_##TYPE##_normal_sample(RNG* generator, ret_type cache) { \
+  generator->set_next_##TYPE##_normal_sample(cache);                                                \
+}                                                                                                   \
+                                                                                                    \
+template <typename RNG, typename ret_type,                                                          \
+          std::enable_if_t<                                                                         \
+            !has_member_set_next_##TYPE##_normal_sample<RNG>::value,                                \
+            int> = 0>                                                                               \
+C10_HOST_DEVICE inline void maybe_set_next_##TYPE##_normal_sample(RNG* /*generator*/, ret_type /*cache*/) { \
+}
+DISTRIBUTION_HELPER_GENERATE_NEXT_NORMAL_METHODS(double)
+DISTRIBUTION_HELPER_GENERATE_NEXT_NORMAL_METHODS(float)
+/**
+ * Samples a normal distribution using the Box-Muller method.
+ * Takes mean and standard deviation as inputs; the standard deviation must be
+ * non-negative (zero is accepted).
+ * Note that the Box-Muller method produces two samples at a time: one is
+ * returned and the other is offered to the generator as a cached "next"
+ * sample, which is consumed first on the following call when the generator
+ * supports it.
+ */
+template <typename T>
+struct normal_distribution {
+  C10_HOST_DEVICE inline normal_distribution(T mean_in, T stdv_in) : mean(mean_in), stdv(stdv_in) {
+    // The check admits 0, so the message says "non-negative" rather than "positive".
+    TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in >= 0, "stdv_in must be non-negative: ", stdv_in);
+  }
+  template <typename RNG>
+  C10_HOST_DEVICE inline dist_acctype<T> operator()(RNG generator){
+    dist_acctype<T> ret;
+    // return cached values if available
+    if constexpr (std::is_same_v<T, double>) {
+      if (maybe_get_next_double_normal_sample(generator, &ret)) {
+        return transformation::normal(ret, mean, stdv);
+      }
+    } else {
+      if (maybe_get_next_float_normal_sample(generator, &ret)) {
+        return transformation::normal(ret, mean, stdv);
+      }
+    }
+    // otherwise generate new normal values from two uniform draws
+    uniform_real_distribution<T> uniform(0.0, 1.0);
+    const dist_acctype<T> u1 = uniform(generator);
+    const dist_acctype<T> u2 = uniform(generator);
+    // Box-Muller: radius from u2, angle from u1.
+    const dist_acctype<T> r = ::sqrt(static_cast<T>(-2.0) * ::log1p(-u2));
+    const dist_acctype<T> theta = static_cast<T>(2.0) * c10::pi<T> * u1;
+    // Offer the sine-based sample to the generator's cache; return the cosine-based one.
+    if constexpr (std::is_same_v<T, double>) {
+      maybe_set_next_double_normal_sample(generator, r * ::sin(theta));
+    } else {
+      maybe_set_next_float_normal_sample(generator, r * ::sin(theta));
+    }
+    ret = r * ::cos(theta);
+    return transformation::normal(ret, mean, stdv);
+  }
+  private:
+    T mean;
+    T stdv;
+};
+// Maps T to a floating-point type: `float` for every T except `double`,
+// which maps to itself.
+template <typename T>
+struct DiscreteDistributionType {
+  using type = float;
+};
+template <>
+struct DiscreteDistributionType<double> {
+  using type = double;
+};
+/**
+ * Draws from a Bernoulli distribution with success probability p in [0, 1].
+ */
+template <typename T>
+struct bernoulli_distribution {
+  C10_HOST_DEVICE inline bernoulli_distribution(T p_in) : p(p_in) {
+    TORCH_CHECK_IF_NOT_ON_CUDA(p_in >= 0 && p_in <= 1);
+  }
+  template <typename RNG>
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    // One uniform draw over [0, 1) feeds the transformation helper.
+    uniform_real_distribution<T> unit_uniform(0.0, 1.0);
+    const auto u = unit_uniform(generator);
+    return transformation::bernoulli<T>(u, p);
+  }
+ private:
+  T p;
+};
+/**
+ * Draws from a geometric distribution with probability p in the open
+ * interval (0, 1).
+ */
+template <typename T>
+struct geometric_distribution {
+  C10_HOST_DEVICE inline geometric_distribution(T p_in) : p(p_in) {
+    TORCH_CHECK_IF_NOT_ON_CUDA(p_in > 0 && p_in < 1);
+  }
+  template <typename RNG>
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    // One uniform draw over [0, 1) feeds the transformation helper.
+    uniform_real_distribution<T> unit_uniform(0.0, 1.0);
+    const auto u = unit_uniform(generator);
+    return transformation::geometric<T>(u, p);
+  }
+ private:
+  T p;
+};
+/**
+ * Draws from an exponential distribution parameterized by lambda.
+ */
+template <typename T>
+struct exponential_distribution {
+  C10_HOST_DEVICE inline exponential_distribution(T lambda_in)
+      : lambda(lambda_in) {}
+  template <typename RNG>
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    // One uniform draw over [0, 1) feeds the transformation helper.
+    uniform_real_distribution<T> unit_uniform(0.0, 1.0);
+    const auto u = unit_uniform(generator);
+    return transformation::exponential<T>(u, lambda);
+  }
+ private:
+  T lambda;
+};
+/**
+ * Draws from a Cauchy distribution parameterized by median and sigma.
+ */
+template <typename T>
+struct cauchy_distribution {
+  C10_HOST_DEVICE inline cauchy_distribution(T median_in, T sigma_in)
+      : median(median_in), sigma(sigma_in) {}
+  template <typename RNG>
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    // One uniform draw over [0, 1) feeds the transformation helper.
+    uniform_real_distribution<T> unit_uniform(0.0, 1.0);
+    const auto u = unit_uniform(generator);
+    return transformation::cauchy<T>(u, median, sigma);
+  }
+ private:
+  T median;
+  T sigma;
+};
+/**
+ * Samples a lognormal distribution.
+ * Takes the mean and standard deviation of the underlying normal distribution
+ * as inputs; the standard deviation must be strictly positive.
+ * Each call draws one sample from normal_distribution and passes it through
+ * transformation::log_normal.
+ */
+template <typename T>
+struct lognormal_distribution {
+  C10_HOST_DEVICE inline lognormal_distribution(T mean_in, T stdv_in)
+      : mean(mean_in), stdv(stdv_in) {
+    TORCH_CHECK_IF_NOT_ON_CUDA(stdv_in > 0);
+  }
+  template <typename RNG>
+  C10_HOST_DEVICE inline T operator()(RNG generator) {
+    normal_distribution<T> underlying_normal(mean, stdv);
+    return transformation::log_normal<T>(underlying_normal(generator));
+  }
+ private:
+  T mean;
+  T stdv;
+};
+}
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Formatting.h ADDED Viewed

	@@ -0,0 +1,25 @@

+#pragma once
+#include <ostream>
+#include <string>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Tensor.h>
+namespace c10 {
+TORCH_API std::ostream& operator<<(std::ostream& out, Backend b);
+TORCH_API std::ostream& operator<<(std::ostream & out, const Scalar& s);
+TORCH_API std::string toString(const Scalar& s);
+}
+namespace at {
+TORCH_API std::ostream& operator<<(std::ostream& out, const DeprecatedTypeProperties& t);
+TORCH_API std::ostream& print(
+    std::ostream& stream,
+    const Tensor& tensor,
+    int64_t linesize);
+// Streams a tensor by delegating to print() with a line size of 80.
+inline std::ostream& operator<<(std::ostream& out, const Tensor& t) {
+  constexpr int64_t line_size = 80;
+  return print(out, t, line_size);
+}
+TORCH_API void print(const Tensor & t, int64_t linesize=80);
+}

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Generator.h ADDED Viewed

	@@ -0,0 +1,191 @@

+#pragma once
+#include <cstdint>
+#include <deque>
+#include <mutex>
+#include <utility>
+#include <c10/util/Exception.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/core/Device.h>
+#include <c10/core/DispatchKeySet.h>
+// For the record I don't think this is a correct pimpl idiom.
+// Including Impl header in interface header defeats the purpose
+// because you can't change Impl private members without forcing
+// everything that included the interface to rebuild.
+// Impl should be forward-declared in the interface header instead.
+#include <c10/core/GeneratorImpl.h>
+/**
+ * Note [Generator]
+ * ~~~~~~~~~~~~~~~~
+ * A Pseudo Random Number Generator (PRNG) is an engine that uses an algorithm to
+ * generate a seemingly random sequence of numbers, that may later be used in creating
+ * a random distribution. Such an engine almost always maintains a state and requires a
+ * seed to start off the creation of random numbers. Often times, users have
+ * found it beneficial to be able to explicitly create, retain, and destroy
+ * PRNG states and also be able to have control over the seed value.
+ *
+ * A Generator in ATen gives users the ability to read, write and modify a PRNG engine.
+ * For instance, it does so by letting users seed a PRNG engine, fork the state of the
+ * engine, etc.
+ *
+ * By default, there is one generator per device, and a device's generator is
+ * lazily created. A user can use the torch.Generator() api to create their own generator.
+ */
+/**
+ * Note [Acquire lock when using random generators]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Generator and its derived classes are NOT thread-safe. Please note that most of the
+ * places where we have inserted locking for generators are historically based, and we
+ * haven't actually checked that everything is truly thread safe (and it probably isn't).
+ * Please use the public mutex_ when using any methods from these classes, except for the
+ * read-only methods. You can learn about the usage by looking into the unittests
+ * (aten/src/ATen/cpu_generator_test.cpp) and other places where we have used lock_guard.
+ *
+ * TODO: Look into changing the threading semantics of Generators in ATen (e.g., making
+ * them non-thread safe and instead making the generator state splittable, to accommodate
+ * forks into other threads).
+ */
+namespace at {
+class Tensor;
+// Handle around an intrusive pointer to a c10::GeneratorImpl. Most members
+// simply forward to the implementation object.
+struct TORCH_API Generator {
+  Generator() = default;
+  // Wraps an existing implementation; a null implementation is rejected.
+  explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl)
+      : impl_(std::move(gen_impl)) {
+    if (unsafeGetGeneratorImpl() == nullptr) {
+      throw std::runtime_error("GeneratorImpl with nullptr is not supported");
+    }
+  }
+  // Generators compare by identity of the implementation they point at.
+  bool operator==(const Generator& rhs) const {
+    return impl_ == rhs.impl_;
+  }
+  bool operator!=(const Generator& rhs) const {
+    const bool same = (*this == rhs);
+    return !same;
+  }
+  // Whether this handle currently holds an implementation.
+  bool defined() const {
+    return static_cast<bool>(impl_);
+  }
+  // Raw, non-owning access to the implementation.
+  c10::GeneratorImpl* unsafeGetGeneratorImpl() const {
+    return impl_.get();
+  }
+  // Releases the implementation pointer held by this handle to the caller.
+  c10::GeneratorImpl* unsafeReleaseGeneratorImpl() {
+    return impl_.release();
+  }
+  const c10::intrusive_ptr<c10::GeneratorImpl>& getIntrusivePtr() const {
+    return impl_;
+  }
+  void set_current_seed(uint64_t seed) {
+    impl_->set_current_seed(seed);
+  }
+  // Sets the offset of Generator state to the desired offset. This is currently
+  // supported for only Philox based Generators, i.e., CUDA and MPS.
+  void set_offset(uint64_t offset) {
+    impl_->set_offset(offset);
+  }
+  // Returns the offset of Generator state. This is currently supported for only
+  // Philox based Generators, i.e., CUDA and MPS.
+  uint64_t get_offset() const {
+    return impl_->get_offset();
+  }
+  uint64_t current_seed() const {
+    return impl_->current_seed();
+  }
+  uint64_t seed() {
+    return impl_->seed();
+  }
+  // Implementation not inlined to prevent cycle reference between
+  // `ATen/core/Generator.h` and `ATen/core/Tensor.h`
+  void set_state(const at::Tensor& new_state);
+  at::Tensor get_state() const;
+  void graphsafe_set_state(const Generator& new_state);
+  Generator graphsafe_get_state() const;
+  // Exposes the implementation's mutex_ member.
+  std::mutex& mutex() {
+    return impl_->mutex_;
+  }
+  DispatchKeySet key_set() const {
+    return impl_->key_set();
+  }
+  Device device() const {
+    return impl_->device();
+  }
+  void set_pyobj(PyObject* pyobj) const noexcept {
+    impl_->set_pyobj(pyobj);
+  }
+  PyObject* pyobj() const noexcept {
+    return impl_->pyobj();
+  }
+  // Unchecked static_cast of the implementation pointer to T*.
+  template <typename T>
+  T* get() const {
+    return static_cast<T*>(impl_.get());
+  }
+  // New Generator wrapping whatever the implementation's clone() returns.
+  Generator clone() const {
+    return Generator(impl_->clone());
+  }
+ private:
+  c10::intrusive_ptr<c10::GeneratorImpl> impl_;
+};
+// Constructs an `Impl` from the forwarded arguments and wraps it in a Generator.
+template <class Impl, class... Args>
+Generator make_generator(Args&&... args) {
+  auto impl = c10::make_intrusive<Impl>(std::forward<Args>(args)...);
+  return Generator(std::move(impl));
+}
+/**
+ * Validates an optional Generator and returns its implementation pointer
+ * statically cast to the backend generator type T (CPU/CUDAGeneratorImpl etc.).
+ * Fails a TORCH_CHECK when the optional is empty, when the generator is
+ * undefined, or when its device type differs from T::device_type().
+ */
+template <typename T>
+inline T* check_generator(std::optional<Generator> gen) {
+  TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt");
+  TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed");
+  TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'");
+  T* typed_impl = gen->get<T>();
+  return typed_impl;
+}
+/**
+ * Utility used in tensor implementations: returns the supplied generator when
+ * one is present and defined, and the default generator otherwise. Either way
+ * the result goes through check_generator, which also static-casts it to the
+ * backend generator type (CPU/CUDAGeneratorImpl etc.).
+ */
+template <typename T>
+inline T* get_generator_or_default(const std::optional<Generator>& gen, const Generator& default_gen) {
+  if (gen.has_value() && gen->defined()) {
+    return check_generator<T>(gen);
+  }
+  return check_generator<T>(default_gen);
+}
+namespace detail {
+/**
+ * Validates a tensor that is about to become the new random generator state.
+ * Currently enforced:
+ *
+ * - The new state tensor must be a strided CPU tensor of dtype Byte
+ *   (a torch.ByteTensor)
+ * - Data of the new state tensor must be contiguous
+ */
+inline void check_rng_state(const c10::TensorImpl& new_state) {
+  const bool is_cpu_byte_tensor =
+      new_state.layout() == kStrided &&
+      new_state.device().type() == kCPU &&
+      new_state.dtype() == kByte;
+  TORCH_CHECK_TYPE(is_cpu_byte_tensor, "RNG state must be a torch.ByteTensor");
+  TORCH_CHECK(new_state.is_contiguous(), "RNG state must be contiguous");
+}
+} // namespace detail
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/GeneratorForPrivateuseone.h ADDED Viewed

	@@ -0,0 +1,39 @@

+#pragma once
+#include <ATen/core/Generator.h>
+#include <c10/util/intrusive_ptr.h>
+namespace at {
+using GeneratorFuncType = std::function<at::Generator(c10::DeviceIndex)>;
+TORCH_API std::optional<GeneratorFuncType>& GetGeneratorPrivate();
+class TORCH_API _GeneratorRegister {
+ public:
+  explicit _GeneratorRegister(const GeneratorFuncType& func);
+};
+TORCH_API at::Generator GetGeneratorForPrivateuse1(
+    c10::DeviceIndex device_index);
+/**
+ * This is used to register Generator to PyTorch for `privateuse1` key.
+ *
+ * Usage: REGISTER_GENERATOR_PRIVATEUSE1(MakeGeneratorForPrivateuse1)
+ *
+ * class CustomGeneratorImpl : public c10::GeneratorImpl {
+ *   CustomGeneratorImpl(DeviceIndex device_index = -1);
+ *   ~CustomGeneratorImpl() override = default;
+ *   ...
+ * };
+ *
+ * at::Generator MakeGeneratorForPrivateuse1(c10::DeviceIndex id) {
+ *   return at::make_generator<CustomGeneratorImpl>(id);
+ * }
+ */
+#define REGISTER_GENERATOR_PRIVATEUSE1(GeneratorPrivate) \
+  static auto temp##GeneratorPrivate = at::_GeneratorRegister(GeneratorPrivate);
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/IListRef.h ADDED Viewed

	@@ -0,0 +1,631 @@

+#pragma once
+#include <ATen/core/ivalue_to.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Exception.h>
+#include <functional>
+#include <initializer_list>
+#include <iterator>
+#include <type_traits>
+/*
+ * [Note: IListRef]
+ * Wrapper around different API containers (e.g. boxed and unboxed).
+ *
+ * What is it?
+ * ===========
+ * It is a tagged union of both boxed and unboxed API containers.
+ * Working implementations:
+ *
+ * - `IListRef<at::Tensor>`
+ * - `IListRef<at::OptionalTensorRef>`
+ *
+ * Note that `IListRef` is a view type. Meaning that it won't own the
+ * tensors it holds. It's intended to be used only as argument parameters.
+ * Specifically, where these 2 worlds overlap.
+ *
+ * What is this for?
+ * =================
+ * Historically, PyTorch has maintained 2 different APIs: the unboxed
+ * (called from C++ API and Python eager mode) and boxed APIs (called
+ * from the TorchScript JIT, mobile interpreter, and boxed fallbacks).
+ *
+ * Calling unboxed kernels from the boxed "world" and vice-versa may
+ * result in non-negligible overhead. Lists are one of those types:
+ *
+ * - Boxed world: `c10::List`
+ * - Unboxed world: `c10::ArrayRef`
+ *
+ * In this context, `c10::IListRef` solves this problem by wrapping those
+ * 2 container types, so that we don't need to convert from one to
+ * the other.
+ *
+ * (see https://github.com/pytorch/pytorch/issues/66328)
+ *
+ * What does it do?
+ * ================
+ * This container wraps around the different tagged containers
+ * (currently, only boxed and unboxed), without incurring extra
+ * overhead for converting from one to another. It does so while
+ * exposing usual container methods, which dispatch to corresponding
+ * implementations.
+ *
+ * While it works with different container types, it introduces
+ * overhead for repeatedly calling member functions (since those will
+ * get dispatched, again). Therefore, you should only use it to iterate
+ * through the list up to one time. If you need to do more complex things,
+ * call `materialize()` first.
+ *
+ * Adding support for a new Tag
+ * ============================
+ * Suppose we want to add a new tag: `Chest`. Here are the steps
+ * we would have to go through:
+ *
+ * 1. Add a line for it in the macro `TORCH_ILISTREF_FORALL_TAGS`.
+ *
+ *   #define TORCH_ILISTREF_FORALL_TAGS(_, ...) \
+ *     ...
+ *     _(Chest, ##__VA_ARGS__)
+ *
+ * 2. Add type aliases, union members, and constructors.
+ *
+ *   template <typename T>
+ *   class IListRef {
+ *     ...
+ *     using chest_type =
+ *       typename detail::IListRefTagImpl<IListRefTag::Chest, T>::list_type;
+ *     ...
+ *     IListRef(...) : tag_(IListRefTag::Chest) {
+ *       ...
+ *     }
+ *     ...
+ *     union Payload {
+ *       ...
+ *       chest_type chest;
+ *       ...
+ *     };
+ *     ...
+ *   };
+ *
+ * 3. Add a default implementation for it (in 'IListRef_inl.h'). It's
+ *    preferable to make the default implementation work for `T = Tensor`
+ *    (both `Unboxed` and `Boxed` do it).
+ *
+ *   template <typename T, typename ListElemT>
+ *   class IListRefTagImplBase<IListRefTag::Chest, T, ListElemT> {
+ *    public:
+ *     using elem_type = ListElemT;
+ *     using list_type = ChestContainer<elem_type>;
+ *
+ *     static const list_type& unwrap(const IListRef<T>& ilist) { ... }
+ *
+ *     static typename list_type::const_iterator& unwrap(
+ *         IListRefIterator<T>& it) { ... }
+ *
+ *     static const typename list_type::const_iterator& unwrap(
+ *         const IListRefIterator<T>& it) { ... }
+ *
+ *     static IListRefConstRef<T> iterator_get(
+ *         const typename list_type::const_iterator& it) { ... }
+ *   }
+ *
+ * 4. Add a specialization for each of the already supported types.
+ *    Finally, for consistency, add them to the tracking list.
+ *    (see [Note: IListRefTagImpl Specializations])
+ *
+ *   template <>
+ *   class IListRefTagImpl<IListRefTag::Chest, at::Tensor>
+ *       : public IListRefTagImplBase<IListRefTag::Chest, at::Tensor> {};
+ *
+ * Adding support for a new Type
+ * =============================
+ * Suppose we want to add support for a new type: `Matrix`.
+ * Here are the steps we would have to go through:
+ *
+ * 1. Add a specialization for each of the existing tags.
+ *    For consistency, add them to the tracking list.
+ *    (see [Note: IListRefTagImpl Specializations])
+ *
+ *   template <>
+ *   class IListRefTagImpl<IListRefTag::Unboxed, Matrix>
+ *       : public IListRefTagImplBase<IListRefTag::Unboxed, Matrix> {};
+ *
+ *   template <>
+ *   class IListRefTagImpl<IListRefTag::Boxed, Matrix>
+ *       : public IListRefTagImplBase<IListRefTag::Boxed, Matrix> {};
+ *
+ * Common Problems
+ * ===============
+ * 1. One of the `IListRef(Iterator)` methods is failing to compile.
+ *
+ *     That may be happening because the container type you added
+ *     is not compatible with the code written for that method. If
+ *     that's true, then you might have to transform that code into
+ *     a static method call (see `List::operator[]` method).
+ *
+ * 2. Can't make `IListRefIterator<T>::operator*` return a const-reference.
+ *
+ *    First, keep in mind that we assume that boxed containers will
+ *    have to deal with `IValue` (e.g. `c10::List`). In this context,
+ *    what may be happening is that `IValue` doesn't store internally
+ *    your type `T`. Instead, it constructs a new `T` every time
+ *    you try to get a `T` from it (see `IListRef<at::OptionalTensorRef>`).
+ */
+namespace c10 {
+template <typename T>
+class IListRef;
+/*
+ * Applies an arbitrary macro to each `IListRefTag` (except `None`).
+ *
+ * `_` is the macro to apply. It is invoked once per tag, with the tag
+ * name as its first argument, followed by any extra arguments given to
+ * `TORCH_ILISTREF_FORALL_TAGS`.
+ */
+#define TORCH_ILISTREF_FORALL_TAGS(_, ...) \
+  _(Unboxed, ##__VA_ARGS__)                \
+  _(Boxed, ##__VA_ARGS__)                  \
+  _(Materialized, ##__VA_ARGS__)
+/*
+ * Defines a "switch-case" for `TAG`. Inside, it executes `BODY`,
+ * while bringing to scope:
+ *
+ * - `ImplT`: the implementation class for `TAG`
+ * - `this_`: the result of unwrapping `this`
+ *
+ * The expansion refers to `T` and `*this`, so it can only be used inside
+ * a member function where a type named `T` is in scope. The generated
+ * case always ends with a `break`.
+ */
+#define TORCH_ILISTREF_UNWRAP_CASE(TAG, BODY)                        \
+  case c10::IListRefTag::TAG: {                                      \
+    using ImplT = c10::detail::IListRefTagImpl<IListRefTag::TAG, T>; \
+    auto& this_ = ImplT::unwrap(*this);                              \
+    BODY                                                             \
+  } break;
+/*
+ * Dispatches the unwrap call, depending on `TAG`, followed by
+ * the execution of `BODY`. It fails a `TORCH_INTERNAL_ASSERT` if `TAG`
+ * is not one of the tags listed in `TORCH_ILISTREF_FORALL_TAGS` (this
+ * includes `IListRefTag::None`).
+ *
+ * This macro is useful because it allows code that handles different
+ * types (that correspond to different tags) to be implemented
+ * only once. We can do it even when the implementations for the
+ * different tags aren't syntactically the same, by dispatching
+ * it to a function (e.g. `ImplT::<dispatch-function>(this_)`).
+ */
+#define TORCH_ILISTREF_UNWRAP(TAG, BODY)                         \
+  switch (TAG) {                                                 \
+    TORCH_ILISTREF_FORALL_TAGS(TORCH_ILISTREF_UNWRAP_CASE, BODY) \
+    break;                                                       \
+    default:                                                     \
+      TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag.");     \
+  }
+enum class IListRefTag { // one enumerator per entry of TORCH_ILISTREF_FORALL_TAGS, then `None`
+#define DEFINE_TAG(tag, ...) tag,
+  TORCH_ILISTREF_FORALL_TAGS(DEFINE_TAG)
+#undef DEFINE_TAG
+      None // tag used by default-constructed `IListRef` / `IListRefIterator`
+};
+namespace detail {
+/*
+ * Type alias that specifies whether we return a reference or a copy of `T`.
+ *
+ * What is this for?
+ * =================
+ * Since values in the boxed world are represented by an `IValue`, we also
+ * depend on whether it can be converted to a const-reference (`Tensor`) or
+ * has to create a new copy of `T` (`OptionalTensorRef`).
+ *
+ * It is the return type of `IListRefIterator<T>::operator*`,
+ * `IListRef<T>::front`, and of the `front` / `iterator_get` dispatch targets.
+ */
+template <typename T>
+using IListRefConstRef = typename ivalue_to_const_ref_overload_return<T>::type;
+/*
+ * Interface that implements key functions for each `IListRefTag` type.
+ *
+ * What is this for?
+ * =================
+ * Given an `IListRef(Iterator)<T>`, some methods have to be implemented
+ * differently for each `TAG`. Therefore, the methods inside this class
+ * are used as dispatch targets for the different `IListRefTag` values.
+ *
+ * You should create a specialization of this class for each possible
+ * combination of `IListRefTag` type (except `None`) and element types
+ * (e.g. `Tensor`).
+ *
+ * The primary template declared below is empty: the members described
+ * here only exist on the explicit specializations.
+ *
+ * What does it do?
+ * ================
+ * 1. defines static methods to be used as dispatch targets by both
+ *    `IListRef<T>` and `IListRefIterator<T>` (see the implementation of
+ *    `IListRefTagImplBase`).
+ *
+ * 2. defines the `elem_type` and `list_type` aliases that will be
+ *    used in the definition of `IListRef<T>`. In general, we should do
+ *    so by inheriting from `IListRefTagImplBase<TAG, T, ListElemT>`.
+ *
+ * [Note: IListRefTagImpl Specializations]
+ * =======================================
+ * For `IListRef(Iterator)<at::Tensor>`:
+ * - <IListRefTag::Unboxed, at::Tensor>
+ * - <IListRefTag::Boxed, at::Tensor>
+ * - <IListRefTag::Materialized, at::Tensor>
+ *
+ * For `IListRef(Iterator)<at::OptionalTensorRef>`:
+ * - <IListRefTag::Unboxed, at::OptionalTensorRef>
+ * - <IListRefTag::Boxed, at::OptionalTensorRef>
+ * - <IListRefTag::Materialized, at::OptionalTensorRef>
+ */
+template <IListRefTag TAG, typename T>
+class IListRefTagImpl {};
+/*
+ * Base implementation of `IListRefTagImpl<TAG, T>` methods.
+ *
+ * What is this for?
+ * =================
+ * This should make adding specializations for new types easier. For
+ * example, one should be able to add a new type just by making its
+ * `IListRefTagImpl` specialization inherit from `IListRefTagImplBase`.
+ *
+ * You should create a partial specialization for this class only if
+ * you introduce a new `IListRefTag`. The idea being that there is one
+ * default implementation for each possible value of `IListRefTag`.
+ *
+ * The primary template declared below is empty: the defaults live in
+ * the partial specializations.
+ *
+ * What does it do?
+ * ================
+ * 1. defines `elem_type` as an alias to `ListElemT`.
+ *
+ * 2. defines `list_type` as an alias to the default container type
+ *    that will hold a collection of `elem_type`. The idea being that
+ *    all types tagged as `TAG` will have `list_type` as their container,
+ *    with different `elem_type`.
+ *
+ * 3. defines the default implementation for each of the methods that
+ *    are supposed to be defined on `IListRefTagImpl` specializations.
+ *
+ * 4. inheriting from `IListRefTagImplBase<TAG, T, ListElemT>` also means
+ *    that the payload of the type `IListRef<T>` will be of type `list_type`
+ *    when it is tagged as `TAG`.
+ */
+template <IListRefTag TAG, typename T, typename ListElemT = T>
+class IListRefTagImplBase {};
+/*
+ * Materialized container for `IListRef<T>`.
+ *
+ * What is this for?
+ * =================
+ * Container that groups `T` references together. This exchanges the
+ * overhead of every method call from `IListRef<T>` for a dynamic allocation.
+ *
+ * You should use this container instead of `IListRef<T>` if:
+ *
+ *   - You are going to iterate the list more than once
+ *   - You need to repeatedly access arbitrary elements (using `operator[]`)
+ *
+ * What does it do?
+ * ================
+ * `MaterializedIListRef<T>` is a `std::vector` of
+ * `MaterializedIListRefElem<T>`. The element type is computed from
+ * `IListRefConstRef<T>`: if that is a reference type, the reference (&)
+ * is removed and the result is wrapped into a `std::reference_wrapper`.
+ * If `IListRefConstRef<T>` is not a reference type, then it's left
+ * unchanged.
+ */
+template <typename T>
+using _MaterializedIListRefElem = std::conditional_t<
+    std::is_reference_v<T>,
+    typename std::reference_wrapper<std::remove_reference_t<T>>,
+    T>;
+template <typename T>
+using MaterializedIListRefElem = _MaterializedIListRefElem<IListRefConstRef<T>>;
+template <typename T>
+using MaterializedIListRef = std::vector<MaterializedIListRefElem<T>>;
+} // namespace detail
+/*
+ * Iterator for `IListRef<T>`.
+ *
+ * What is it?
+ * ===========
+ * Currently, a bidirectional iterator (see `iterator_category`) that wraps
+ * the iterator types defined for each of the `IListRefTag`.
+ *
+ * One should be able to use it, as if it were the unwrapped
+ * iterators themselves.
+ *
+ * What does it do?
+ * ================
+ * Similarly to `IListRef<T>`, this is a wrapper class. Specifically, it
+ * wraps each container's `const_iterator` type alias. So, for example,
+ * given that the container for `IListRefTag::Boxed` is `c10::List`, this
+ * iterator will wrap a `c10::List::const_iterator`.
+ *
+ * A default-constructed iterator is tagged `IListRefTag::None`: it wraps
+ * nothing. It may be copied and destroyed, but dereferencing or advancing
+ * it fails a `TORCH_INTERNAL_ASSERT`.
+ *
+ * [Note: MSVC Iterator Debug]
+ * ===========================
+ * MSVC `vector<T>::iterator` implementation (used in the boxed variant)
+ * makes it so this union's destructor, copy-constructor (assignment), and
+ * move-constructor (assignment) are implicitly deleted.
+ *
+ * Therefore, we need to explicitly define them as needed. Follows a list
+ * of places where these are needed and their reason:
+ *
+ *   - `Payload` destructor:
+ *     it is deleted only if the macro `_ITERATOR_DEBUG_LEVEL` is set to 2.
+ *
+ *   - `IListRefIterator` destructor:
+ *     same as above. However, we also need to explicitly call the
+ *     destructor of the active variant.
+ *
+ *   - `IListRefIterator` copy-constructor:
+ *     it is deleted only if the macro `_ITERATOR_DEBUG_LEVEL` is different
+ *     than 0.
+ */
+template <typename T>
+class IListRefIterator {
+ private:
+#define DEFINE_FRIEND_CLASS(TAG, ...)                        \
+  friend class detail::IListRefTagImpl<IListRefTag::TAG, T>; \
+  friend class detail::IListRefTagImplBase<                  \
+      IListRefTag::TAG,                                      \
+      T,                                                     \
+      typename detail::IListRefTagImpl<IListRefTag::TAG, T>::elem_type>;
+  TORCH_ILISTREF_FORALL_TAGS(DEFINE_FRIEND_CLASS)
+#undef DEFINE_FRIEND_CLASS
+ public:
+  // C++17 friendly std::iterator implementation
+  using iterator_category = std::bidirectional_iterator_tag;
+  using value_type = T;
+  using difference_type = std::ptrdiff_t;
+  using pointer = T*;
+  using reference = T&;
+  // Iterator types of the containers backing each tag.
+  using unboxed_iterator_type = typename detail::
+      IListRefTagImpl<IListRefTag::Unboxed, T>::list_type::const_iterator;
+  using boxed_iterator_type = typename detail::
+      IListRefTagImpl<IListRefTag::Boxed, T>::list_type::const_iterator;
+  using materialized_iterator_type =
+      typename detail::MaterializedIListRef<T>::const_iterator;
+  // Creates an iterator that wraps nothing (tag `None`).
+  IListRefIterator() : tag_(IListRefTag::None) {}
+#if defined(_MSC_VER) && _ITERATOR_DEBUG_LEVEL != 0
+  // See [Note: MSVC Iterator Debug]
+  // Copies only the union member selected by `tag_`.
+  IListRefIterator(const IListRefIterator& iterator)
+      : tag_(iterator.tag_) {
+    switch (tag_) {
+      case IListRefTag::Boxed:
+        payload_.boxed_iterator = iterator.payload_.boxed_iterator;
+        break;
+      case IListRefTag::Unboxed:
+        payload_.unboxed_iterator = iterator.payload_.unboxed_iterator;
+        break;
+      case IListRefTag::Materialized:
+        payload_.materialized_iterator = iterator.payload_.materialized_iterator;
+        break;
+      case IListRefTag::None:
+        // Default-constructed iterator: `Payload()` already left the union
+        // in its empty state, so there is nothing to copy.
+        break;
+      default:
+        TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag.");
+    }
+  }
+#endif
+#if defined(_MSC_VER) && _ITERATOR_DEBUG_LEVEL == 2
+  // See [Note: MSVC Iterator Debug]
+  // Destroys only the union member selected by `tag_`.
+  ~IListRefIterator() noexcept(false) {
+    switch (tag_) {
+      case IListRefTag::Boxed:
+        payload_.boxed_iterator.~boxed_iterator_type();
+        break;
+      case IListRefTag::Unboxed:
+        payload_.unboxed_iterator.~unboxed_iterator_type();
+        break;
+      case IListRefTag::Materialized:
+        payload_.materialized_iterator.~materialized_iterator_type();
+        break;
+      case IListRefTag::None:
+        // Default-constructed iterator: no wrapped iterator was ever
+        // stored, so there is nothing to destroy. Without this case, the
+        // destruction of every default-constructed iterator would hit the
+        // assertion below.
+        break;
+      default:
+        TORCH_INTERNAL_ASSERT(false, "invalid IListRef tag.");
+    }
+  }
+#endif
+  // Wrapping constructors: store the given iterator and tag this object
+  // accordingly.
+  IListRefIterator(boxed_iterator_type boxed) : tag_(IListRefTag::Boxed) {
+    payload_.boxed_iterator = boxed;
+  }
+  IListRefIterator(unboxed_iterator_type unboxed) : tag_(IListRefTag::Unboxed) {
+    payload_.unboxed_iterator = unboxed;
+  }
+  IListRefIterator(materialized_iterator_type materialized) : tag_(IListRefTag::Materialized) {
+    payload_.materialized_iterator = materialized;
+  }
+  // Dereferences the wrapped iterator through the `iterator_get` dispatch
+  // target of the implementation class for `tag_`.
+  detail::IListRefConstRef<T> operator*() const {
+    TORCH_ILISTREF_UNWRAP(tag_, { return ImplT::iterator_get(this_); });
+  }
+  // Pre-increment: advances the wrapped iterator.
+  IListRefIterator& operator++() {
+    TORCH_ILISTREF_UNWRAP(tag_, { ++this_; });
+    return *this;
+  }
+  // Post-increment: advances the wrapped iterator, returns the old value.
+  IListRefIterator operator++(int) {
+    auto old = *this;
+    TORCH_ILISTREF_UNWRAP(tag_, { ++this_; });
+    return old;
+  }
+  // Pre-decrement: moves the wrapped iterator back.
+  IListRefIterator& operator--() {
+    TORCH_ILISTREF_UNWRAP(tag_, { --this_; });
+    return *this;
+  }
+  // Post-decrement: moves the wrapped iterator back, returns the old value.
+  IListRefIterator operator--(int) {
+    auto old = *this;
+    TORCH_ILISTREF_UNWRAP(tag_, { --this_; });
+    return old;
+  }
+  // Iterators with different tags never compare equal. Otherwise, the
+  // wrapped iterators are compared.
+  bool operator==(const IListRefIterator& rhs) const {
+    if (tag_ != rhs.tag_) {
+      return false;
+    }
+    TORCH_ILISTREF_UNWRAP(tag_, {
+      auto& rhs_it = ImplT::unwrap(rhs);
+      return this_ == rhs_it;
+    });
+  }
+  bool operator!=(const IListRefIterator& rhs) const {
+    return !(*this == rhs);
+  }
+ private:
+  // Storage for the wrapped iterator. `tag_` tells which member is in use.
+  union Payload {
+    boxed_iterator_type boxed_iterator;
+    unboxed_iterator_type unboxed_iterator;
+    materialized_iterator_type materialized_iterator;
+    // Member initialized by the default constructor (tag `None`).
+    void* _init_ptr;
+    Payload() : _init_ptr(nullptr) {}
+#if defined(_MSC_VER)
+    // See [Note: MSVC Iterator Debug]
+    ~Payload() {}
+#endif
+  };
+  Payload payload_;
+  IListRefTag tag_;
+};
+/*
+ * See [Note: IListRef]
+ *
+ * Tagged wrapper over one of three list representations. Depending on
+ * `tag_`, the payload is a pointer to a boxed list, an unboxed list stored
+ * by value, or a pointer to a materialized list. Member functions dispatch
+ * on `tag_` through `TORCH_ILISTREF_UNWRAP`, so calling them on a
+ * default-constructed (`None`) instance fails a `TORCH_INTERNAL_ASSERT`.
+ */
+template <typename T>
+class IListRef {
+ private:
+#define DEFINE_FRIEND_CLASS(TAG, ...)                        \
+  friend class detail::IListRefTagImpl<IListRefTag::TAG, T>; \
+  friend class detail::IListRefTagImplBase<                  \
+      IListRefTag::TAG,                                      \
+      T,                                                     \
+      typename detail::IListRefTagImpl<IListRefTag::TAG, T>::elem_type>;
+  TORCH_ILISTREF_FORALL_TAGS(DEFINE_FRIEND_CLASS)
+#undef DEFINE_FRIEND_CLASS
+ public:
+  using unboxed_type =
+      typename detail::IListRefTagImpl<IListRefTag::Unboxed, T>::list_type;
+  using boxed_type =
+      typename detail::IListRefTagImpl<IListRefTag::Boxed, T>::list_type;
+  using materialized_type =
+      typename detail::MaterializedIListRef<T>;
+  using iterator = IListRefIterator<T>;
+  using const_iterator = IListRefIterator<T>; // same type as `iterator`
+  using reverse_iterator = std::reverse_iterator<iterator>;
+  using value_type = typename iterator::value_type;
+  IListRef() : tag_(IListRefTag::None) {} // wraps nothing
+  IListRef(const boxed_type& boxed) : tag_(IListRefTag::Boxed) {
+    payload_.boxed = &boxed; // only the address is stored
+  }
+  IListRef(const unboxed_type& unboxed) : tag_(IListRefTag::Unboxed) {
+    payload_.unboxed = unboxed;
+  }
+  IListRef(const std::initializer_list<T>& list) : tag_(IListRefTag::Unboxed) {
+    payload_.unboxed = at::ArrayRef<T>(list); // NOTE(review): presumably a view that must not outlive `list` - confirm
+  }
+  template <
+      typename... UnboxedConstructorArgs,
+      typename = std::enable_if_t<
+          std::is_constructible_v<unboxed_type, UnboxedConstructorArgs...>>>
+  IListRef(UnboxedConstructorArgs&&... args) : tag_(IListRefTag::Unboxed) { // accepts anything `unboxed_type` can be constructed from
+    payload_.unboxed = unboxed_type(std::forward<UnboxedConstructorArgs>(args)...);
+  }
+  IListRef(const materialized_type& materialized) : tag_(IListRefTag::Materialized) {
+    payload_.materialized = &materialized; // only the address is stored
+  }
+  size_t size() const {
+    TORCH_ILISTREF_UNWRAP(tag_, { return this_.size(); });
+  }
+  bool empty() const {
+    return size() == 0;
+  }
+  iterator begin() const {
+    TORCH_ILISTREF_UNWRAP(tag_, { return this_.begin(); });
+  }
+  iterator end() const {
+    TORCH_ILISTREF_UNWRAP(tag_, { return this_.end(); });
+  }
+  detail::IListRefConstRef<T> front() const {
+    TORCH_ILISTREF_UNWRAP(tag_, { return ImplT::front(this_); });
+  }
+  /*
+   * Materializes the `IListRef` into a `std::vector`.
+   *
+   * This should be used when one wishes to either:
+   *
+   *   - iterate over the list more than once: each `IListRefIterator`
+   *     member function call has to go through a switch, introducing
+   *     non-negligible overhead
+   *
+   *   - randomly access an arbitrary element using `operator[]`:
+   *     same reason as above
+   *
+   * The result is returned by value. If this instance is already tagged
+   * as `Materialized`, the wrapped vector is copied. Otherwise, a new
+   * vector is filled by iterating over this list once.
+   */
+  detail::MaterializedIListRef<T> materialize() const {
+    if (isMaterialized()) {
+      return toMaterialized();
+    }
+    detail::MaterializedIListRef<T> materialized;
+    materialized.reserve(size());
+    for (const auto& t : *this) {
+      materialized.emplace_back(t);
+    }
+    return materialized;
+  }
+#define DEFINE_CHECK(TAG, ...)    \
+  bool is##TAG() const {          \
+    return tag_ == IListRefTag::TAG; \
+  }
+  TORCH_ILISTREF_FORALL_TAGS(DEFINE_CHECK) // defines isUnboxed(), isBoxed(), isMaterialized()
+#undef DEFINE_CHECK
+  bool isNone() const {
+    return tag_ == IListRefTag::None;
+  }
+#define DEFINE_CASTING(TAG, ...)                                          \
+  const typename detail::IListRefTagImpl<IListRefTag::TAG, T>::list_type& \
+      to##TAG() const {                                                   \
+    TORCH_INTERNAL_ASSERT(is##TAG());                                     \
+    return detail::IListRefTagImpl<IListRefTag::TAG, T>::unwrap(*this);   \
+  }
+  TORCH_ILISTREF_FORALL_TAGS(DEFINE_CASTING) // defines toUnboxed(), toBoxed(), toMaterialized(); each asserts the matching tag
+#undef DEFINE_CASTING
+ private:
+  union Payload { // `tag_` tells which member is in use
+    const boxed_type* boxed;
+    unboxed_type unboxed;
+    const materialized_type* materialized;
+    Payload() : boxed(nullptr) {}
+  };
+  Payload payload_;
+  IListRefTag tag_;
+};
+} // namespace c10
+#include <ATen/core/IListRef_inl.h>

.venv/lib/python3.12/site-packages/torch/include/ATen/core/IListRef_inl.h ADDED Viewed

	@@ -0,0 +1,203 @@

+#pragma once
+#include <ATen/core/List.h>
+#include <ATen/core/Tensor.h>
+namespace at {
+class Tensor;
+class OptionalTensorRef;
+}
+namespace c10::detail {
+/*
+ * Specialization of `IListRefTagImplBase` that provides the default
+ * implementation for `IListRefTag::Unboxed`. The container type is
+ * `ArrayRef<elem_type>`.
+ */
+template <typename T, typename ListElemT>
+class IListRefTagImplBase<IListRefTag::Unboxed, T, ListElemT> {
+ public:
+  using elem_type = ListElemT;
+  using list_type = ArrayRef<elem_type>;
+  /*
+   * These `unwrap` static methods unwrap the inner containers (and inner
+   * iterators) out of `IListRef<T>` (and `IListRefIterator<T>`). They are
+   * required when the macro `TORCH_ILISTREF_UNWRAP` is called.
+   */
+  static const list_type& unwrap(const IListRef<T>& ilist) {
+    return ilist.payload_.unboxed;
+  }
+  static typename list_type::const_iterator& unwrap(IListRefIterator<T>& it) {
+    return it.payload_.unboxed_iterator;
+  }
+  static const typename list_type::const_iterator& unwrap(
+      const IListRefIterator<T>& it) {
+    return it.payload_.unboxed_iterator;
+  }
+  /*
+   * We have these functions (besides the `unwrap`s above) because the
+   * implementations of both `IListRef::front` and `IListRefIterator::operator*`
+   * weren't syntactically equal for the existing tags at the time
+   * (`Unboxed` and `Boxed`).
+   */
+  static IListRefConstRef<T> front(const list_type& lst) {
+    return lst.front();
+  }
+  static IListRefConstRef<T> iterator_get(
+      const typename list_type::const_iterator& it) {
+    return *it;
+  }
+};
+/*
+ * Specialization of `IListRefTagImplBase` that provides the default
+ * implementation for `IListRefTag::Boxed`. The container type is
+ * `List<elem_type>`.
+ *
+ * Note that the default `iterator_get` calls `toTensor()` on the value
+ * obtained from the iterator, so it only fits element types that can be
+ * produced that way. Other element types have to provide their own
+ * `iterator_get` in their `IListRefTagImpl` specialization.
+ */
+template <typename T, typename ListElemT>
+class IListRefTagImplBase<IListRefTag::Boxed, T, ListElemT> {
+ public:
+  using elem_type = ListElemT;
+  using list_type = List<elem_type>;
+  static const list_type& unwrap(const IListRef<T>& ilist) {
+    return *ilist.payload_.boxed; // the payload stores a pointer to the list
+  }
+  static typename list_type::const_iterator& unwrap(IListRefIterator<T>& it) {
+    return it.payload_.boxed_iterator;
+  }
+  static const typename list_type::const_iterator& unwrap(
+      const IListRefIterator<T>& it) {
+    return it.payload_.boxed_iterator;
+  }
+  static IListRefConstRef<T> front(const list_type& lst) {
+    return lst[0];
+  }
+  static IListRefConstRef<T> iterator_get(
+      const typename list_type::const_iterator& it) {
+    return (*it).get().toTensor();
+  }
+};
+/*
+ * Specialization of `IListRefTagImplBase` that provides the default
+ * implementation for `IListRefTag::Materialized`. The container type is
+ * `MaterializedIListRef<T>`.
+ *
+ * Unlike the other tags, this is only specialized for
+ * `ListElemT = MaterializedIListRefElem<T>`, so that argument has to be
+ * spelled out when inheriting from it.
+ */
+template <typename T>
+class IListRefTagImplBase<IListRefTag::Materialized, T, MaterializedIListRefElem<T>> {
+ public:
+  using elem_type = MaterializedIListRefElem<T>;
+  using list_type = MaterializedIListRef<T>;
+  static const list_type& unwrap(const IListRef<T>& ilist) {
+    return *ilist.payload_.materialized; // the payload stores a pointer to the list
+  }
+  static typename list_type::const_iterator& unwrap(IListRefIterator<T>& it) {
+    return it.payload_.materialized_iterator;
+  }
+  static const typename list_type::const_iterator& unwrap(
+      const IListRefIterator<T>& it) {
+    return it.payload_.materialized_iterator;
+  }
+  static IListRefConstRef<T> front(const list_type& lst) {
+    return lst[0];
+  }
+  static IListRefConstRef<T> iterator_get(
+      const typename list_type::const_iterator& it) {
+    return *it;
+  }
+};
+/*
+ * [Note: ITensorListRef]
+ * Specializations necessary for `IListRef<at::Tensor>` type.
+ *
+ * Since the default implementations are usually done with supporting
+ * `Tensor` in mind, we only have to inherit from the base implementations.
+ *
+ * The `Materialized` one names its element type explicitly, because the
+ * base class is only specialized for `MaterializedIListRefElem<T>`.
+ */
+template <>
+class IListRefTagImpl<IListRefTag::Unboxed, at::Tensor>
+    : public IListRefTagImplBase<IListRefTag::Unboxed, at::Tensor> {};
+template <>
+class IListRefTagImpl<IListRefTag::Boxed, at::Tensor>
+    : public IListRefTagImplBase<IListRefTag::Boxed, at::Tensor> {};
+template <>
+class IListRefTagImpl<IListRefTag::Materialized, at::Tensor>
+    : public IListRefTagImplBase<
+          IListRefTag::Materialized,
+          at::Tensor,
+          MaterializedIListRefElem<at::Tensor>> {};
+/*
+ * [Note: IOptTensorListRef]
+ * Specializations necessary for `IListRef<at::OptionalTensorRef>` type.
+ *
+ * We can't get an `at::OptionalTensorRef` directly from an instance of
+ * `List<optional<Tensor>>` (the type that corresponds to the boxed world).
+ *
+ * So, the default implementation of `iterator_get` won't help us for the
+ * `Boxed` tag. Thus, we have to implement that method ourselves.
+ */
+template <>
+class IListRefTagImpl<IListRefTag::Unboxed, at::OptionalTensorRef>
+    : public IListRefTagImplBase<IListRefTag::Unboxed, at::OptionalTensorRef> {};
+template <>
+class IListRefTagImpl<IListRefTag::Boxed, at::OptionalTensorRef>
+    : public IListRefTagImplBase<IListRefTag::Boxed, at::OptionalTensorRef, std::optional<at::Tensor>> {
+ public:
+  /*
+   * Given an instance of the types corresponding to the `Boxed` tag, we override
+   * the default implementation, so that we can return an `at::OptionalTensorRef`.
+   *
+   * The result is empty when the underlying `IValue` is `None`, or when it
+   * holds an undefined tensor.
+   */
+  static IListRefConstRef<at::OptionalTensorRef> iterator_get(
+      const typename list_type::const_iterator& it) {
+    C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wdangling-reference")
+    const auto& ivalue = (*it).get();
+    C10_DIAGNOSTIC_POP()
+    if (!ivalue.isNone()) {
+        const auto& tensor = ivalue.toTensor();
+        return (tensor.defined()) ? tensor : at::OptionalTensorRef{}; // undefined tensor -> empty result
+    }
+    return {}; // `None` -> empty result
+  }
+};
+template <>
+class IListRefTagImpl<IListRefTag::Materialized, at::OptionalTensorRef>
+    : public IListRefTagImplBase<
+          IListRefTag::Materialized,
+          at::OptionalTensorRef,
+          MaterializedIListRefElem<at::OptionalTensorRef>> {};
+} // namespace c10::detail
+namespace at {
+// [Note: ITensorListRef] Aliases for the `at::Tensor` instantiations.
+using ITensorListRef = c10::IListRef<at::Tensor>;
+using ITensorListRefIterator = c10::IListRefIterator<at::Tensor>;
+using MaterializedITensorListRef = c10::detail::MaterializedIListRef<at::Tensor>;
+// [Note: IOptTensorListRef] Aliases for the `at::OptionalTensorRef` instantiations.
+using IOptTensorListRef = c10::IListRef<at::OptionalTensorRef>;
+using IOptTensorListRefIterator = c10::IListRefIterator<at::OptionalTensorRef>;
+using MaterializedIOptTensorListRef = c10::detail::MaterializedIListRef<at::OptionalTensorRef>;
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/LegacyTypeDispatch.h ADDED Viewed

	@@ -0,0 +1,111 @@

+#pragma once
+// The legacy mechanism for dispatching operators in ATen is a Type
+// object, which is essentially a giant virtual dispatch table
+// for every operation we support dynamically dispatching over.
+//
+// This has been deprecated in favor of ATenDispatch, and in the future,
+// c10 dispatcher.
+// TODO: Clean up what remains here
+#include <c10/core/impl/LocalDispatchKeySet.h>
+namespace at {
+// A RAII, thread local (!) guard that will disable dispatch to variable
+// handler.
+//
+// NOTE [ Treating Variables as non-Variables in type dispatch ]
+//
+// What exactly does AutoDispatchBelowAutograd do?  The short answer is, it causes
+// dispatches on ATen functions to go to the non-variable implementation,
+// bypassing autograd handling (and also profiling and tracing).
+//
+// To understand why this guard exists, it's helpful to understand the history
+// behind how Variable was implemented.  Previously, Variables were implemented
+// as a wrapper on Tensors; so the act of processing a Variable involved
+// unwrapping the underlying Tensor, and then calling the underlying base
+// operation on /that/ Tensor.
+//
+// However, after the Variable/Tensor merge, there is no concept of unwrapping
+// a tensor anymore.  If you just call the operation on the same variable
+// again inside your VariableType handler, you'll dispatch back to
+// VariableType, which is not what we want.
+//
+// The solution to the above problem is to add `at::AutoDispatchBelowAutograd`, which
+// when enabled will cause `legacyTensorType()` and `getType()` to always return
+// non-Variable type, even if the tensor being called on is a variable.
+/* Note [AutoDispatchBelowAutograd]
+ * AutoDispatchBelowAutograd is **INTERNAL ONLY**: it should be used only
+ * for kernel implementations and customized C++ kernels.
+ * If you are looking for a guard to run workload in inference mode, please use
+ * c10::InferenceMode RAII which is user facing API.
+ * In the past, AutoDispatchBelowAutograd (or its old version, AutoNonVariableTypeMode)
+ * was used in user code for inference-only workloads; this risked
+ * silently producing wrong results in some edge cases. For example:
+ * ```
+ *  torch::Tensor s = torch::ones({1, 2, 3}).set_requires_grad(true);
+ *  torch::Tensor out = s * s;
+ *  {
+ *    at::AutoDispatchBelowAutograd guard;
+ *    s.add_(1);  // Skips version bump on `s`.
+ *  }
+ *  // WRONG GRADIENT! s.grad() are now computed using `s` value after the
+ *  // inplace update.
+ *  out.backward(torch::ones_like(out));
+ * ```
+ * Users should use `c10::InferenceMode` here so that it'll properly throw an
+ * error saying "one of the variables needed for gradient computation has been modified."
+ */
+struct TORCH_API AutoDispatchBelowAutograd { // See Note [AutoDispatchBelowAutograd]
+  AutoDispatchBelowAutograd() :
+    autograd_guard_(c10::autograd_dispatch_keyset) {
+  }
+  // Guard member built from `c10::autograd_dispatch_keyset`: disables all autograd dispatch keys.
+  c10::impl::ExcludeDispatchKeyGuard autograd_guard_;
+};
+// TODO: AutoNonVariableTypeMode should be removed in release 1.10.
+struct TORCH_API AutoNonVariableTypeMode { // deprecated: warns once on construction
+  AutoNonVariableTypeMode(bool enabled = true) :
+    autograd_guard_(c10::autograd_dispatch_keyset) {
+    TORCH_WARN_ONCE("AutoNonVariableTypeMode is deprecated and will be removed in 1.10 release. "
+        "For kernel implementations please use AutoDispatchBelowADInplaceOrView instead, "
+        "If you are looking for a user facing API to enable running your inference-only "
+        "workload, please use c10::InferenceMode. Using AutoDispatchBelowADInplaceOrView in user code "
+        "is under risk of producing silent wrong result in some edge cases. "
+        "See Note [AutoDispatchBelowAutograd] for more details.");
+    TORCH_INTERNAL_ASSERT(enabled); // `enabled == false` is rejected; the flag is not used otherwise
+  }
+  // Guard member built from `c10::autograd_dispatch_keyset`: disables all autograd dispatch keys.
+  c10::impl::ExcludeDispatchKeyGuard autograd_guard_;
+};
+struct TORCH_API AutoDispatchSkipFunctionalize { // guard that excludes the `Functionalize` dispatch key
+  AutoDispatchSkipFunctionalize() :
+    dispatch_key_guard_(c10::DispatchKeySet(c10::DispatchKey::Functionalize)) {
+  }
+  c10::impl::ExcludeDispatchKeyGuard dispatch_key_guard_; // built from a key set holding only `Functionalize`
+};
+/* Note [AutoDispatchBelowADInplaceOrView]
+ * AutoDispatchBelowADInplaceOrView is equivalent to AutoNonVariableTypeMode
+ * before we split inplace & view ops out of the VariableType kernel.
+ * Note this guard is used in VariableType kernels for functional ops
+ * as well as ADInplaceOrView kernels for inplace/view ops to enforce the
+ * following invariant:
+ *   Once you are in a VariableType/ADInplaceOrView kernel for an op,
+ *   you never go back to a kernel on the same dispatch key until
+ *   you finish the current op.
+ */
+struct TORCH_API AutoDispatchBelowADInplaceOrView {
+  AutoDispatchBelowADInplaceOrView() :
+    dispatch_key_guard_(c10::autograd_dispatch_keyset_with_ADInplaceOrView) {
+  }
+  // Guard member built from `c10::autograd_dispatch_keyset_with_ADInplaceOrView`:
+  // disables the Autograd & ADInplaceOrView dispatch keys.
+  c10::impl::ExcludeDispatchKeyGuard dispatch_key_guard_;
+};
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/List.h ADDED Viewed

	@@ -0,0 +1,491 @@

+#pragma once
+#include <ATen/core/ivalue_to.h>
+#include <ATen/core/jit_type_base.h>
+#include <c10/macros/Macros.h>
+#include <c10/macros/Export.h>
+#include <c10/util/TypeTraits.h>
+#include <c10/util/TypeList.h>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/ArrayRef.h>
+#include <optional>
+#include <vector>
+namespace at {
+class Tensor;
+}
+namespace c10 {
+struct IValue;
+template<class T> class List;
+struct Type;
+namespace detail {
+struct ListImpl final : public c10::intrusive_ptr_target { // list payload: `IValue` elements plus their element type
+  using list_type = std::vector<IValue>;
+  explicit TORCH_API ListImpl(list_type list_, TypePtr elementType_);
+  list_type list; // the elements, each held as an `IValue`
+  TypePtr elementType; // type descriptor kept alongside the elements
+  intrusive_ptr<ListImpl> copy() const { // new `ListImpl` constructed from `list` and `elementType`
+    return make_intrusive<ListImpl>(list, elementType);
+  }
+  friend TORCH_API bool operator==(const ListImpl& lhs, const ListImpl& rhs);
+};
+}
+namespace impl {
+template<class T, class Iterator> class ListIterator;
+template<class T, class Iterator> class ListElementReference;
+template<class T, class Iterator>
+void swap(ListElementReference<T, Iterator>&& lhs, ListElementReference<T, Iterator>&& rhs) noexcept;
+template<class T, class Iterator>
+bool operator==(const ListElementReference<T, Iterator>& lhs, const T& rhs);
+template<class T, class Iterator>
+bool operator==(const T& lhs, const ListElementReference<T, Iterator>& rhs);
+template<class T>
+struct ListElementConstReferenceTraits {
+  // In the general case, `const_reference` is the return type of the
+  // const-reference overload of IValue::to() for `T`.
+  using const_reference = typename c10::detail::ivalue_to_const_ref_overload_return<T>::type;
+};
+// There is no to() overload for std::optional<std::string>, so this
+// specialization spells out its `const_reference` type by hand.
+template<>
+struct ListElementConstReferenceTraits<std::optional<std::string>> {
+  using const_reference = std::optional<std::reference_wrapper<const std::string>>;
+};
+template<class T, class Iterator>
+class ListElementReference final { // proxy for one list element; it only stores an iterator to that element
+public:
+  operator std::conditional_t< // converts to `const T&` when IValue can hand out a reference, to `T` otherwise
+      std::is_reference_v<typename c10::detail::
+                            ivalue_to_const_ref_overload_return<T>::type>,
+      const T&,
+      T>() const;
+  ListElementReference& operator=(T&& new_value) &&; // rvalue-qualified: only callable on temporaries
+  ListElementReference& operator=(const T& new_value) &&;
+  // assigning another ref to this assigns the underlying value
+  ListElementReference& operator=(ListElementReference&& rhs) && noexcept;
+  const IValue& get() const& { // the `IValue` the stored iterator points to
+    return *iterator_;
+  }
+  friend void swap<T, Iterator>(ListElementReference&& lhs, ListElementReference&& rhs) noexcept;
+  ListElementReference(const ListElementReference&) = delete; // not copyable
+  ListElementReference& operator=(const ListElementReference&) = delete;
+  ~ListElementReference() = default;
+private:
+  ListElementReference(Iterator iter) // private: only the friends declared below can create instances
+  : iterator_(iter) {}
+  // allow moving, but only our friends (i.e. the List class) can move us
+  ListElementReference(ListElementReference&&) noexcept = default;
+  ListElementReference& operator=(ListElementReference&& rhs) & noexcept { // lvalue-qualified: rebinds the iterator instead of assigning the value
+    iterator_ = std::move(rhs.iterator_);
+    return *this;
+  }
+  friend class List<T>;
+  friend class ListIterator<T, Iterator>;
+  Iterator iterator_;
+};
+// Opaque iterator over a List<T>. It wraps the iterator of the underlying
+// container so that user code cannot rely on that concrete iterator type.
+template <class T, class Iterator>
+class ListIterator final {
+ public:
+  // Standard iterator member types (C++17 friendly stand-in for std::iterator).
+  using iterator_category = std::random_access_iterator_tag;
+  using value_type = T;
+  using difference_type = std::ptrdiff_t;
+  using pointer = T*;
+  // Dereferencing yields a proxy object rather than a real T&.
+  using reference = ListElementReference<T, Iterator>;
+  explicit ListIterator() = default;
+  ~ListIterator() = default;
+  ListIterator(const ListIterator&) = default;
+  ListIterator(ListIterator&&) noexcept = default;
+  ListIterator& operator=(const ListIterator&) = default;
+  ListIterator& operator=(ListIterator&&) noexcept = default;
+  // Pre-increment: advance by one element and return this iterator.
+  ListIterator& operator++() {
+    ++iterator_;
+    return *this;
+  }
+  // Post-increment: advance by one element, return the position held before.
+  ListIterator operator++(int) {
+    ListIterator previous(*this);
+    ++iterator_;
+    return previous;
+  }
+  // Pre-decrement: step back by one element and return this iterator.
+  ListIterator& operator--() {
+    --iterator_;
+    return *this;
+  }
+  // Post-decrement: step back by one element, return the position held before.
+  ListIterator operator--(int) {
+    ListIterator previous(*this);
+    --iterator_;
+    return previous;
+  }
+  // Advance in place by `offset` elements.
+  ListIterator& operator+=(typename List<T>::size_type offset) {
+    iterator_ += offset;
+    return *this;
+  }
+  // Step back in place by `offset` elements.
+  ListIterator& operator-=(typename List<T>::size_type offset) {
+    iterator_ -= offset;
+    return *this;
+  }
+  // Returns a new iterator `offset` elements after this one.
+  ListIterator operator+(typename List<T>::size_type offset) const {
+    ListIterator shifted(*this);
+    shifted.iterator_ += offset;
+    return shifted;
+  }
+  // Returns a new iterator `offset` elements before this one.
+  ListIterator operator-(typename List<T>::size_type offset) const {
+    ListIterator shifted(*this);
+    shifted.iterator_ -= offset;
+    return shifted;
+  }
+  // Distance between two iterators, as computed by the wrapped iterators.
+  friend difference_type operator-(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ - rhs.iterator_;
+  }
+  // Dereference: a proxy referring to the current element.
+  reference operator*() const {
+    return reference(iterator_);
+  }
+  // Subscript: a proxy referring to the element `offset` positions ahead.
+  reference operator[](typename List<T>::size_type offset) const {
+    Iterator target = iterator_;
+    target += offset;
+    return reference(target);
+  }
+private:
+  // Only friends (List<T> and the iterator over the plain storage iterator)
+  // may wrap a raw underlying iterator.
+  explicit ListIterator(Iterator underlying): iterator_(std::move(underlying)) {}
+  Iterator iterator_;
+  // The comparisons below forward to the wrapped iterators.
+  friend bool operator==(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ == rhs.iterator_;
+  }
+  friend bool operator!=(const ListIterator& lhs, const ListIterator& rhs) {
+    const bool same_position = (lhs == rhs);
+    return !same_position;
+  }
+  friend bool operator<(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ < rhs.iterator_;
+  }
+  friend bool operator<=(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ <= rhs.iterator_;
+  }
+  friend bool operator>(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ > rhs.iterator_;
+  }
+  friend bool operator>=(const ListIterator& lhs, const ListIterator& rhs) {
+    return lhs.iterator_ >= rhs.iterator_;
+  }
+  friend class ListIterator<T, typename c10::detail::ListImpl::list_type::iterator>;
+  friend class List<T>;
+};
+template<class T> List<T> toTypedList(List<IValue> list); // re-types a generic list as List<T> after a runtime element-type check
+template<class T> List<IValue> toList(List<T>&& list); // type-erases a List<T>, taking over its storage handle
+template<class T> List<IValue> toList(const List<T>& list); // type-erases a List<T>; the result shares storage with the argument
+const IValue* ptr_to_first_element(const List<IValue>& list); // NOTE(review): defined elsewhere; presumably a pointer to element 0 - confirm
+}
+/**
+ * An object of this class stores a list of values of type T.
+ *
+ * This is a pointer type. After a copy, both Lists
+ * will share the same storage:
+ *
+ * > List<std::string> a;
+ * > List<std::string> b = a;
+ * > b.push_back("three");
+ * > ASSERT("three" == a.get(0));
+ *
+ * We use this class in the PyTorch kernel API instead of
+ * std::vector<T>, because that allows us to do optimizations
+ * and switch out the underlying list implementation without
+ * breaking backwards compatibility for the kernel API.
+ */
+template<class T>
+// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
+class List final {
+private:
+  // This is an intrusive_ptr because List is a pointer type.
+  // Invariant: This will never be a nullptr, there will always be a valid
+  // ListImpl.
+  c10::intrusive_ptr<c10::detail::ListImpl> impl_; // shared storage; mutating members are const because they change this storage, not the handle
+  using internal_reference_type = impl::ListElementReference<T, typename c10::detail::ListImpl::list_type::iterator>;
+  using internal_const_reference_type = typename impl::ListElementConstReferenceTraits<T>::const_reference;
+public:
+  using value_type = T;
+  using size_type = typename c10::detail::ListImpl::list_type::size_type;
+  using iterator = impl::ListIterator<T, typename c10::detail::ListImpl::list_type::iterator>;
+  using const_iterator = impl::ListIterator<T, typename c10::detail::ListImpl::list_type::iterator>; // same type as iterator
+  using reverse_iterator = impl::ListIterator<T, typename c10::detail::ListImpl::list_type::reverse_iterator>;
+  /**
+   * Constructs an empty list.
+   * Not available for List<IValue> (rejected by a static_assert in the definition).
+   */
+  explicit List();
+  /**
+   * Constructs a list with some initial values.
+   * Example:
+   *   List<int> a({2, 3, 4});
+   */
+  List(std::initializer_list<T> initial_values);
+  explicit List(ArrayRef<T> initial_values); // copies every element of initial_values into a new list
+  /**
+   * Create a generic list with runtime type information.
+   * This only works for c10::impl::GenericList and is not part of the public API
+   * but only supposed to be used internally by PyTorch.
+   */
+  explicit List(TypePtr elementType);
+  List(const List&) = default; // copying the handle shares the storage (see the class comment)
+  List& operator=(const List&) = default;
+  ~List() = default;
+  /**
+   * Create a new List pointing to a deep copy of the same data.
+   * The List returned is a new list with separate storage.
+   * Changes in it are not reflected in the original list or vice versa.
+   */
+  List copy() const;
+  /**
+   * Returns the element at specified location pos, with bounds checking.
+   * If pos is not within the range of the container, an exception of type std::out_of_range is thrown.
+   */
+  internal_const_reference_type get(size_type pos) const;
+  /**
+   * Moves out the element at the specified location pos and returns it, with bounds checking.
+   * If pos is not within the range of the container, an exception of type std::out_of_range is thrown.
+   * The list contains an invalid element at position pos afterwards. Any operations
+   * on it before re-setting it are invalid.
+   * (The current implementation stores a value-initialized T{} in the slot to keep it correctly typed.)
+   */
+  value_type extract(size_type pos) const;
+  /**
+   * Returns a reference to the element at specified location pos, with bounds checking.
+   * If pos is not within the range of the container, an exception of type std::out_of_range is thrown.
+   *
+   * You cannot store the reference, but you can read it and assign new values to it:
+   *
+   *   List<int64_t> list = ...;
+   *   list[2] = 5;
+   *   int64_t v = list[1];
+   *
+   * The const overload is read-only; only the non-const overload returns an assignable proxy.
+   */
+  internal_const_reference_type operator[](size_type pos) const;
+  internal_reference_type operator[](size_type pos);
+  /**
+   * Assigns a new value to the element at location pos.
+   * The value is copied; pos is looked up with at() on the underlying storage.
+   */
+  void set(size_type pos, const value_type& value) const;
+  /**
+   * Assigns a new value to the element at location pos.
+   * The value is moved in; pos is looked up with at() on the underlying storage.
+   */
+  void set(size_type pos, value_type&& value) const;
+  /**
+   * Returns an iterator to the first element of the container.
+   * If the container is empty, the returned iterator will be equal to end().
+   */
+  iterator begin() const;
+  /**
+   * Returns an iterator to the element following the last element of the container.
+   * This element acts as a placeholder; attempting to access it results in undefined behavior.
+   */
+  iterator end() const;
+  /**
+   * Checks if the container has no elements.
+   */
+  bool empty() const;
+  /**
+   * Returns the number of elements in the container
+   */
+  size_type size() const;
+  /**
+   * Increase the capacity of the vector to a value that's greater or equal to new_cap.
+   */
+  void reserve(size_type new_cap) const;
+  /**
+   * Erases all elements from the container. After this call, size() returns zero.
+   * Invalidates any references, pointers, or iterators referring to contained elements. Any past-the-end iterators are also invalidated.
+   */
+  void clear() const;
+  /**
+   * Inserts value before pos.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator insert(iterator pos, const T& value) const;
+  /**
+   * Inserts value before pos.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator insert(iterator pos, T&& value) const;
+  /**
+   * Inserts a new element into the container directly before pos.
+   * The new element is constructed with the given arguments.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  template<class... Args>
+  iterator emplace(iterator pos, Args&&... value) const;
+  /**
+   * Appends the given element value to the end of the container.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void push_back(const T& value) const;
+  /**
+   * Appends the given element value to the end of the container.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void push_back(T&& value) const;
+  /**
+   * Appends the given list to the end of the container. Uses at most one memory allocation.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void append(List<T> lst) const;
+  /**
+   * Appends the given element value to the end of the container.
+   * The new element is constructed with the given arguments.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  template<class... Args>
+  void emplace_back(Args&&... args) const;
+  /**
+   * Removes the element at pos.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator erase(iterator pos) const;
+  /**
+   * Removes the elements in the range [first, last).
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  iterator erase(iterator first, iterator last) const;
+  /**
+   * Removes the last element of the container.
+   * Calling pop_back on an empty container is undefined.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void pop_back() const;
+  /**
+   * Resizes the container to contain count elements.
+   * If the current size is less than count, additional default-inserted elements are appended.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void resize(size_type count) const;
+  /**
+   * Resizes the container to contain count elements.
+   * If the current size is less than count, additional copies of value are appended.
+   * May invalidate any references, pointers, or iterators referring to contained elements. Any past-the-end iterators may also be invalidated.
+   */
+  void resize(size_type count, const T& value) const;
+  /**
+   * Value equality comparison. This function implements Python-like semantics for
+   * equality: two lists with the same identity (e.g. same pointer) trivially
+   * compare equal, otherwise each element is compared for equality.
+   */
+  template <class T_>
+  friend bool operator==(const List<T_>& lhs, const List<T_>& rhs);
+  template <class T_>
+  friend bool operator!=(const List<T_>& lhs, const List<T_>& rhs);
+  /**
+   * Identity comparison. Returns true if and only if `rhs` represents the same
+   * List object as `this`.
+   */
+  bool is(const List<T>& rhs) const;
+  std::vector<T> vec() const; // copies all elements into a new std::vector<T>
+  /**
+   * Returns the number of Lists currently pointing to this same list.
+   * If this is the only instance pointing to this list, returns 1.
+   */
+  // TODO Test use_count
+  size_t use_count() const;
+  TypePtr elementType() const; // returns the element type recorded in the shared ListImpl
+  // See [unsafe set type] for why this exists.
+  void unsafeSetElementType(TypePtr t);
+private:
+  explicit List(c10::intrusive_ptr<c10::detail::ListImpl>&& elements); // wraps an existing ListImpl, taking over the pointer
+  explicit List(const c10::intrusive_ptr<c10::detail::ListImpl>& elements); // wraps an existing ListImpl, sharing it
+  friend struct IValue;
+  template<class T_> friend List<T_> impl::toTypedList(List<IValue>);
+  template<class T_> friend List<IValue> impl::toList(List<T_>&&);
+  template<class T_> friend List<IValue> impl::toList(const List<T_>&);
+  friend const IValue* impl::ptr_to_first_element(const List<IValue>& list);
+};
+namespace impl {
+// GenericList is how IValue stores lists. It is, however, not part of the
+// public API. Kernels should use Lists with concrete types instead
+// (maybe except for some internal prim ops).
+using GenericList = List<IValue>; // type-erased list: elements are handled as IValue
+}
+}
+namespace torch {
+  template<class T> using List = c10::List<T>; // makes c10::List available under the name torch::List
+}
+#include <ATen/core/List_inl.h>  // IWYU pragma: keep

.venv/lib/python3.12/site-packages/torch/include/ATen/core/List_inl.h ADDED Viewed

	@@ -0,0 +1,353 @@

+#pragma once
+#include <ATen/core/jit_type_base.h>
+#include <ATen/core/ivalue.h>
+namespace c10 {
+template<class T> decltype(auto) getTypePtr(); // forward declaration; used below to obtain the element type for T
+std::string toString(const Type& type); // forward declaration; used below to build the cast-mismatch error message
+template<class T>
+List<T>::List(c10::intrusive_ptr<c10::detail::ListImpl>&& elements)
+: impl_(std::move(elements)) {}
+template<class T>
+List<T>::List(const c10::intrusive_ptr<c10::detail::ListImpl>& elements)
+: impl_(elements) {}
+template<class T>
+List<T>::List()
+: List(make_intrusive<c10::detail::ListImpl>(
+  typename c10::detail::ListImpl::list_type(),
+  getTypePtr<T>())) {
+  static_assert(!std::is_same_v<T, IValue>, "This constructor is not valid for List<IValue>. Please use c10::impl::GenericList(elementType) instead.");
+}
+template<class T>
+List<T>::List(ArrayRef<T> values)
+: List(make_intrusive<c10::detail::ListImpl>(
+    typename c10::detail::ListImpl::list_type(),
+    getTypePtr<T>())) {
+  static_assert(!std::is_same_v<T, IValue>, "This constructor is not valid for List<IValue>. Please use c10::impl::GenericList(elementType).");
+  impl_->list.reserve(values.size());
+  for (const T& element : values) {
+    impl_->list.push_back(element);
+  }
+}
+template<class T>
+List<T>::List(std::initializer_list<T> initial_values)
+: List(ArrayRef<T>(initial_values)) {
+  static_assert(!std::is_same_v<T, IValue>, "This constructor is not valid for List<IValue>. Please use c10::impl::GenericList(elementType).");
+}
+template<class T>
+List<T>::List(TypePtr elementType)
+: List(make_intrusive<c10::detail::ListImpl>(
+    typename c10::detail::ListImpl::list_type(),
+    std::move(elementType))) {
+  static_assert(std::is_same_v<T, IValue> || std::is_same_v<T, c10::intrusive_ptr<ivalue::Future>>,
+                "This constructor is only valid for c10::impl::GenericList or List<Future>.");
+}
+namespace impl {
+// Re-types a generic list as List<T> after checking its runtime element type.
+// Fails via TORCH_CHECK when the stored element type is not acceptable.
+template<class T>
+List<T> toTypedList(impl::GenericList list) {
+  // If there's other instances of the list (i.e. list.use_count() > 1), then we have to be invariant
+  // because upcasting would allow people to add types into the new list that would break the old list.
+  // However, if there aren't any other instances of this list (i.e. list.use_count() == 1), then we can
+  // allow upcasting. This can be a perf improvement since we can cast List<T> to List<optional<T>>
+  // without having to copy it. This is also used to provide backwards compatibility with some old models
+  // that serialized the index arguments to aten::index, aten::index_put, aten::index_put_ and aten::index_put_impl_
+  // as List<Tensor> before we changed that argument to be List<optional<Tensor>>. When deserializing, we
+  // have list.use_count() == 1 and can deserialize the List<Tensor> directly as List<optional<Tensor>>.
+  const auto& stored_type = list.impl_->elementType;
+  // Exact match is always fine.
+  const bool same_type = (*stored_type == *getTypePtr<T>());
+  // Upcasting is only evaluated (and only allowed) when the types differ and
+  // this is the sole handle to the storage.
+  const bool sole_owner_upcast = !same_type
+    && list.use_count() == 1
+    && stored_type->isSubtypeOf(*getTypePtr<T>());
+  TORCH_CHECK(same_type || sole_owner_upcast
+    , "Tried to cast a List<", toString(*list.impl_->elementType), "> to a List<", toString(*getTypePtr<T>()), ">. Types mismatch.");
+  return List<T>(std::move(list.impl_));
+}
+// Type-erases an expiring List<T>; the storage handle is moved into the result.
+template<class T>
+impl::GenericList toList(List<T>&& list) {
+  auto& handle = list.impl_;
+  return GenericList(std::move(handle));
+}
+// Type-erases a List<T>; the result shares storage with `list`.
+template<class T>
+impl::GenericList toList(const List<T>& list) {
+  const auto& handle = list.impl_;
+  return GenericList(handle);
+}
+}
+// Returns a new List backed by whatever ListImpl::copy() produces for the
+// current storage.
+template<class T>
+List<T> List<T>::copy() const {
+  auto cloned_impl = impl_->copy();
+  return List<T>(std::move(cloned_impl));
+}
+namespace detail {
+  template<class T>
+  T list_element_to(T element) { // overload taking a T by value: returns it unchanged
+    return element;
+  }
+  template<class T>
+  T list_element_to(const IValue& element) { // converts a stored IValue to T via to<T>()
+    return element.template to<T>();
+  }
+  template<class T>
+  T list_element_to(IValue&& element) { // converts an expiring IValue to T via the rvalue to<T>()
+    return std::move(element).template to<T>();
+  }
+  template<class T>
+  struct ListElementFrom { // boxes a T into the IValue representation used by the list storage
+    static IValue from(const T& element) { // builds the IValue from a copy of element
+      return element;
+    }
+    static IValue from(T&& element) { // builds the IValue by moving from element
+      return std::move(element);
+    }
+  };
+  template<>
+  struct ListElementFrom<IValue> { // T is already IValue: hand the reference through, no new IValue is created here
+    static const IValue& from(const IValue& element) {
+      return element;
+    }
+    static IValue&& from(IValue&& element) {
+      return std::move(element);
+    }
+  };
+}
+namespace impl {
+template <class T, class Iterator>
+ListElementReference<T, Iterator>::operator std::conditional_t<
+    std::is_reference_v<typename c10::detail::ivalue_to_const_ref_overload_return<
+        T>::type>,
+    const T&,
+    T>() const { // implicit read: converts the referenced IValue with to<T>()
+  return iterator_->template to<T>();
+}
+template<class T, class Iterator>
+ListElementReference<T, Iterator>& ListElementReference<T, Iterator>::operator=(T&& new_value) && { // write-through: moves new_value into the referenced slot
+  *iterator_ = c10::detail::ListElementFrom<T>::from(std::move(new_value));
+  return *this;
+}
+template<class T, class Iterator>
+ListElementReference<T, Iterator>& ListElementReference<T, Iterator>::operator=(const T& new_value) && { // write-through: copies new_value into the referenced slot
+  *iterator_ = c10::detail::ListElementFrom<T>::from(new_value);
+  return *this;
+}
+template<class T, class Iterator>
+ListElementReference<T, Iterator>& ListElementReference<T, Iterator>::operator=(ListElementReference<T, Iterator>&& rhs) && noexcept { // copies the value rhs refers to into this slot; rhs's slot is left unchanged
+  *iterator_ = *rhs.iterator_;
+  return *this;
+}
+template<class T, class Iterator>
+void swap(ListElementReference<T, Iterator>&& lhs, ListElementReference<T, Iterator>&& rhs)  noexcept { // exchanges the two referenced IValues, not the proxies
+  std::swap(*lhs.iterator_, *rhs.iterator_);
+}
+template<class T, class Iterator>
+bool operator==(const ListElementReference<T, Iterator>& lhs, const T& rhs) { // compares the referenced element (converted to T) with rhs
+  const T& lhs_tmp = lhs;
+  return lhs_tmp == rhs;
+}
+template<class T, class Iterator>
+inline bool operator==(const T& lhs, const ListElementReference<T, Iterator>& rhs) { // mirrored form; forwards to the overload above
+  return rhs == lhs;
+}
+template<class T>
+inline typename ListElementConstReferenceTraits<T>::const_reference
+list_element_to_const_ref(const IValue& element) { // generic case: converts the stored IValue with to<T>()
+  return element.template to<T>();
+}
+template<>
+inline typename ListElementConstReferenceTraits<std::optional<std::string>>::const_reference
+list_element_to_const_ref<std::optional<std::string>>(const IValue& element) { // optional<string> case: uses toOptionalStringRef() to return an optional reference to the string
+  return element.toOptionalStringRef();
+}
+} // namespace impl
+// Overwrites the element at index pos with a copy of value.
+// pos is looked up with at() on the underlying storage.
+template<class T>
+void List<T>::set(size_type pos, const value_type& value) const {
+  using boxer = c10::detail::ListElementFrom<T>;
+  auto& storage = impl_->list;
+  storage.at(pos) = boxer::from(value);
+}
+// Overwrites the element at index pos, moving value into the list.
+// pos is looked up with at() on the underlying storage.
+template<class T>
+void List<T>::set(size_type pos, value_type&& value) const {
+  using boxer = c10::detail::ListElementFrom<T>;
+  auto& storage = impl_->list;
+  storage.at(pos) = boxer::from(std::move(value));
+}
+// Read access by index; simply forwards to the const operator[].
+template<class T>
+typename List<T>::internal_const_reference_type List<T>::get(size_type pos) const {
+  return (*this)[pos];
+}
+// Read-only element access. The slot is looked up with at() and then
+// converted to the const-reference type for T.
+template<class T>
+typename List<T>::internal_const_reference_type List<T>::operator[](size_type pos) const {
+  const auto& stored = impl_->list.at(pos);
+  return c10::impl::list_element_to_const_ref<T>(stored);
+}
+// Mutable element access: returns a proxy bound to the slot at pos.
+template<class T>
+typename List<T>::internal_reference_type List<T>::operator[](size_type pos) {
+  using offset_type = typename decltype(impl_->list)::difference_type;
+  auto& storage = impl_->list;
+  static_cast<void>(storage.at(pos)); // Throw the exception if it is out of range.
+  return {storage.begin() + static_cast<offset_type>(pos)};
+}
+// Moves the element at pos out of the list and returns it.
+// pos is looked up with at() on the underlying storage.
+template<class T>
+typename List<T>::value_type List<T>::extract(size_type pos) const {
+  using boxer = c10::detail::ListElementFrom<T>;
+  auto& slot = impl_->list.at(pos);
+  auto extracted = c10::detail::list_element_to<T>(std::move(slot));
+  // Reset the list element to a T() instead of None to keep it correctly typed
+  slot = boxer::from(T{});
+  return extracted;
+}
+// Iterator wrapping the begin() of the underlying storage.
+template<class T>
+typename List<T>::iterator List<T>::begin() const {
+  auto& storage = impl_->list;
+  return iterator(storage.begin());
+}
+// Iterator wrapping the end() of the underlying storage.
+template<class T>
+typename List<T>::iterator List<T>::end() const {
+  auto& storage = impl_->list;
+  return iterator(storage.end());
+}
+// True when the underlying storage holds no elements.
+template<class T>
+bool List<T>::empty() const {
+  const auto& storage = impl_->list;
+  return storage.empty();
+}
+// Number of elements currently held by the underlying storage.
+template<class T>
+typename List<T>::size_type List<T>::size() const {
+  const auto& storage = impl_->list;
+  return storage.size();
+}
+// Forwards the capacity request to the underlying storage.
+template<class T>
+void List<T>::reserve(size_type new_cap) const {
+  auto& storage = impl_->list;
+  storage.reserve(new_cap);
+}
+// Removes every element from the underlying storage.
+template<class T>
+void List<T>::clear() const {
+  auto& storage = impl_->list;
+  storage.clear();
+}
+// Inserts a copy of value before pos; returns an iterator to the new element.
+template<class T>
+typename List<T>::iterator List<T>::insert(iterator pos, const T& value) const {
+  using boxer = c10::detail::ListElementFrom<T>;
+  auto& storage = impl_->list;
+  return iterator(storage.insert(pos.iterator_, boxer::from(value)));
+}
+// Inserts value before pos by moving it in; returns an iterator to the new
+// element.
+template<class T>
+typename List<T>::iterator List<T>::insert(iterator pos, T&& value) const {
+  using boxer = c10::detail::ListElementFrom<T>;
+  auto& storage = impl_->list;
+  return iterator(storage.insert(pos.iterator_, boxer::from(std::move(value))));
+}
+// Constructs a new element in place before pos, forwarding the arguments
+// straight to the storage's emplace().
+template<class T>
+template<class... Args>
+typename List<T>::iterator List<T>::emplace(iterator pos, Args&&... value) const {
+  // TODO Use list_element_from?
+  auto& storage = impl_->list;
+  return iterator(storage.emplace(pos.iterator_, std::forward<Args>(value)...));
+}
+// Appends a copy of value to the end of the list.
+template<class T>
+void List<T>::push_back(const T& value) const {
+  using boxer = c10::detail::ListElementFrom<T>;
+  auto& storage = impl_->list;
+  storage.push_back(boxer::from(value));
+}
+// Appends value to the end of the list, moving it in.
+template<class T>
+void List<T>::push_back(T&& value) const {
+  using boxer = c10::detail::ListElementFrom<T>;
+  auto& storage = impl_->list;
+  storage.push_back(boxer::from(std::move(value)));
+}
+// Appends every element of `b` to the end of this list.
+//
+// Three cases are distinguished:
+//  * `b` shares storage with this list (e.g. `x.append(x)`): the elements
+//    are duplicated in place. Passing a vector's own iterators to its range
+//    insert() is undefined behavior, so this case reserves the final size
+//    first and then copies element by element by index.
+//  * `b` is the only handle to its storage: its elements are moved out,
+//    since nobody else can observe them afterwards.
+//  * otherwise: the elements are copied.
+// In every case the destination grows with at most one allocation.
+template<class T>
+void List<T>::append(List<T> b) const {
+  auto& destination = impl_->list;
+  if (b.impl_ == impl_) {
+    const size_type original_size = destination.size();
+    // After this reserve none of the push_back calls below can reallocate,
+    // so reading destination[i] while appending is safe.
+    destination.reserve(original_size * 2);
+    for (size_type i = 0; i < original_size; ++i) {
+      destination.push_back(destination[i]);
+    }
+    return;
+  }
+  auto& source = b.impl_->list;
+  if (b.use_count() == 1) {
+    destination.insert(destination.end(), std::make_move_iterator(source.begin()), std::make_move_iterator(source.end()));
+  } else {
+    destination.insert(destination.end(), source.begin(), source.end());
+  }
+}
+// Builds a T from the forwarded arguments and appends it to the list.
+template<class T>
+template<class... Args>
+void List<T>::emplace_back(Args&&... args) const {
+  // TODO Use list_element_from?
+  auto& storage = impl_->list;
+  storage.push_back(T(std::forward<Args>(args)...));
+}
+// Removes the element at pos; returns the iterator the storage's erase()
+// hands back, re-wrapped.
+template<class T>
+typename List<T>::iterator List<T>::erase(iterator pos) const {
+  auto& storage = impl_->list;
+  return iterator(storage.erase(pos.iterator_));
+}
+// Removes the elements in [first, last); returns the iterator the storage's
+// erase() hands back, re-wrapped.
+template<class T>
+typename List<T>::iterator List<T>::erase(iterator first, iterator last) const {
+  auto& storage = impl_->list;
+  return iterator(storage.erase(first.iterator_, last.iterator_));
+}
+// Removes the last element by forwarding to the storage's pop_back().
+template<class T>
+void List<T>::pop_back() const {
+  auto& storage = impl_->list;
+  storage.pop_back();
+}
+// Resizes to count elements; any newly added slots are filled with a
+// value-initialized T.
+template<class T>
+void List<T>::resize(size_type count) const {
+  auto& storage = impl_->list;
+  storage.resize(count, T{});
+}
+// Resizes to count elements; any newly added slots are filled with copies of
+// value.
+template<class T>
+void List<T>::resize(size_type count, const T& value) const {
+  auto& storage = impl_->list;
+  storage.resize(count, value);
+}
+// Equality: lists sharing the same storage are equal without further work;
+// otherwise the two ListImpl objects are compared with their own operator==.
+template<class T>
+bool operator==(const List<T>& lhs, const List<T>& rhs) {
+  const bool same_identity = (lhs.impl_ == rhs.impl_);
+  return same_identity || (*lhs.impl_ == *rhs.impl_);
+}
+// Inequality is defined as the negation of operator==.
+template<class T>
+bool operator!=(const List<T>& lhs, const List<T>& rhs) {
+  const bool equal = (lhs == rhs);
+  return !equal;
+}
+// Identity test: true only when both handles point at the same storage.
+template<class T>
+bool List<T>::is(const List<T>& rhs) const {
+  return impl_ == rhs.impl_;
+}
+// Copies the elements, in order, into a freshly built std::vector<T>.
+template<class T>
+std::vector<T> List<T>::vec() const {
+  return std::vector<T>(begin(), end());
+}
+// Reference count reported by the intrusive_ptr that holds the storage.
+template<class T>
+size_t List<T>::use_count() const {
+  const auto& handle = impl_;
+  return handle.use_count();
+}
+// Element type recorded in the shared ListImpl.
+template <class T>
+TypePtr List<T>::elementType() const {
+  const auto& recorded_type = impl_->elementType;
+  return recorded_type;
+}
+// Overwrites the element type recorded in the shared ListImpl; no check is
+// performed against the elements already stored.
+template <class T>
+void List<T>::unsafeSetElementType(TypePtr t) {
+  auto& recorded_type = impl_->elementType;
+  recorded_type = std::move(t);
+}
+}

.venv/lib/python3.12/site-packages/torch/include/ATen/core/MT19937RNGEngine.h ADDED Viewed

	@@ -0,0 +1,194 @@

+#pragma once
+#include <c10/util/irange.h>
+// define constants like M_PI and C keywords for MSVC
+#ifdef _MSC_VER
+#ifndef _USE_MATH_DEFINES
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+#endif
+#include <array>
+#include <cmath>
+#include <cstdint>
+namespace at {
+constexpr int MERSENNE_STATE_N = 624; // number of 32-bit words in the engine's state array
+constexpr int MERSENNE_STATE_M = 397; // offset of the partner word XORed in when the state is regenerated
+constexpr uint32_t MATRIX_A = 0x9908b0df; // XORed into the twist result when the low bit of the second word is set
+constexpr uint32_t UMASK = 0x80000000; // most significant bit of a 32-bit word
+constexpr uint32_t LMASK = 0x7fffffff; // least significant 31 bits of a 32-bit word
+/**
+ * Note [Mt19937 Engine implementation]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Originally implemented in:
+ * http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/MTARCOK/mt19937ar-cok.c
+ * and modified with C++ constructs. Moreover the state array of the engine
+ * has been modified to hold 32 bit uints instead of 64 bits.
+ *
+ * Note that we reimplemented mt19937 instead of using std::mt19937 because,
+ * at::mt19937 turns out to be faster in the pytorch codebase. PyTorch builds with -O2
+ * by default and following are the benchmark numbers (benchmark code can be found at
+ * https://github.com/syed-ahmed/benchmark-rngs):
+ *
+ * with -O2
+ * Time to get 100000000 philox randoms with at::uniform_real_distribution = 0.462759s
+ * Time to get 100000000 at::mt19937 randoms with at::uniform_real_distribution = 0.39628s
+ * Time to get 100000000 std::mt19937 randoms with std::uniform_real_distribution = 0.352087s
+ * Time to get 100000000 std::mt19937 randoms with at::uniform_real_distribution = 0.419454s
+ *
+ * std::mt19937 is faster when used in conjunction with std::uniform_real_distribution,
+ * however we can't use std::uniform_real_distribution because of this bug:
+ * http://open-std.org/JTC1/SC22/WG21/docs/lwg-active.html#2524. Plus, even if we used
+ * std::uniform_real_distribution and filtered out the 1's, it is a different algorithm
+ * than what's in pytorch currently and that messes up the tests in tests_distributions.py.
+ * The other option, using std::mt19937 with at::uniform_real_distribution is a tad bit slower
+ * than at::mt19937 with at::uniform_real_distribution and hence, we went with the latter.
+ *
+ * Copyright notice:
+ * A C-program for MT19937, with initialization improved 2002/2/10.
+ * Coded by Takuji Nishimura and Makoto Matsumoto.
+ * This is a faster version by taking Shawn Cokus's optimization,
+ * Matthe Bellew's simplification, Isaku Wada's real version.
+ *
+ * Before using, initialize the state by using init_genrand(seed)
+ * or init_by_array(init_key, key_length).
+ *
+ * Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ *
+ *   3. The names of its contributors may not be used to endorse or promote
+ *   products derived from this software without specific prior written
+ *   permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * Any feedback is very welcome.
+ * http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+ * email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+ */
+/**
+ * mt19937_data_pod is used to get POD data in and out
+ * of mt19937_engine. Used in torch.get_rng_state and
+ * torch.set_rng_state functions.
+ */
+struct mt19937_data_pod {
+  uint64_t seed_; // seed value the engine was initialized with (stored as passed in)
+  int left_; // countdown used by operator(): the state is regenerated when the decremented value reaches 0
+  bool seeded_; // set to true when the engine is seeded
+  uint32_t next_; // index into state_ of the next word to be output
+  std::array<uint32_t, MERSENNE_STATE_N> state_; // the MERSENNE_STATE_N 32-bit state words
+};
+// 32-bit Mersenne Twister engine. All of its state lives in one
+// mt19937_data_pod so it can be exported with data() and restored with
+// set_data().
+class mt19937_engine {
+public:
+  // Constructs an engine and seeds it with `seed` (5489 when omitted).
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+  inline explicit mt19937_engine(uint64_t seed = 5489) {
+    init_with_uint32(seed);
+  }
+  // Returns a copy of the complete engine state.
+  inline mt19937_data_pod data() const {
+    return data_;
+  }
+  // Replaces the complete engine state. The new state is not validated
+  // here; callers can check it afterwards with is_valid().
+  inline void set_data(const mt19937_data_pod& data) {
+    data_ = data;
+  }
+  // Returns the seed stored in the current state.
+  inline uint64_t seed() const {
+    return data_.seed_;
+  }
+  // Sanity check of the current state: the engine must be seeded, left_ must
+  // lie in [1, MERSENNE_STATE_N] and next_ must not exceed MERSENNE_STATE_N.
+  // Does not modify the engine, so it is callable on const engines.
+  inline bool is_valid() const {
+    return data_.seeded_
+      && (data_.left_ > 0 && data_.left_ <= MERSENNE_STATE_N)
+      && (data_.next_ <= MERSENNE_STATE_N);
+  }
+  // Produces the next 32-bit output. The state array is regenerated first
+  // whenever the countdown in left_ reaches zero.
+  inline uint32_t operator()() {
+    if (--(data_.left_) == 0) {
+        next_state();
+    }
+    uint32_t y = data_.state_[data_.next_++];
+    // Scramble the raw state word with the shift/mask steps below.
+    y ^= (y >> 11);
+    y ^= (y << 7) & 0x9d2c5680;
+    y ^= (y << 15) & 0xefc60000;
+    y ^= (y >> 18);
+    return y;
+  }
+private:
+  mt19937_data_pod data_;
+  // Seeds the engine. Only the low 32 bits of `seed` enter the state array;
+  // the full 64-bit value is what gets recorded in seed_.
+  inline void init_with_uint32(uint64_t seed) {
+    data_.seed_ = seed;
+    data_.seeded_ = true;
+    data_.state_[0] = static_cast<uint32_t>(seed & 0xffffffff);
+    for (const auto j : c10::irange(1, MERSENNE_STATE_N)) {
+      data_.state_[j] = (1812433253 * (data_.state_[j-1] ^ (data_.state_[j-1] >> 30)) + j);
+    }
+    // left_ == 1 makes the very first operator() call regenerate the state.
+    data_.left_ = 1;
+    data_.next_ = 0;
+  }
+  // Combines the top bit of u with the low 31 bits of v.
+  inline static uint32_t mix_bits(uint32_t u, uint32_t v) {
+    return (u & UMASK) | (v & LMASK);
+  }
+  // Shifts the mixed word right by one and XORs in MATRIX_A when v is odd.
+  inline static uint32_t twist(uint32_t u, uint32_t v) {
+    return (mix_bits(u,v) >> 1) ^ (v & 1 ? MATRIX_A : 0);
+  }
+  // Regenerates all MERSENNE_STATE_N state words in place and resets the
+  // output cursor. The statement order matters: later words read words that
+  // were already rewritten earlier in the same pass.
+  inline void next_state() {
+    uint32_t* p = data_.state_.data();
+    data_.left_ = MERSENNE_STATE_N;
+    data_.next_ = 0;
+    // First N - M words: the partner word lies M positions ahead.
+    for(int j = MERSENNE_STATE_N - MERSENNE_STATE_M + 1; --j; p++) {
+      *p = p[MERSENNE_STATE_M] ^ twist(p[0], p[1]);
+    }
+    // Next M - 1 words: the partner index wraps around, so it is addressed
+    // with the negative offset M - N (an already regenerated word).
+    for(int j = MERSENNE_STATE_M; --j; p++) {
+      *p = p[MERSENNE_STATE_M - MERSENNE_STATE_N] ^ twist(p[0], p[1]);
+    }
+    // Last word: its successor wraps around to state_[0].
+    *p = p[MERSENNE_STATE_M - MERSENNE_STATE_N] ^ twist(p[0], data_.state_[0]);
+  }
+};
+using mt19937 = mt19937_engine; // short alias for the engine class
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/NamedTensor.h ADDED Viewed

	@@ -0,0 +1,143 @@

+#pragma once
+#include <ATen/core/Dimname.h>
+#include <c10/core/TensorImpl.h>
+namespace at {
+class TensorBase; // forward declaration; only used by reference in the declarations below
+// XXX: This file exists because TensorImpl is in c10, but Dimname is in ATen.
+// Due to the c10/ATen library split, TensorImpl cannot depend on Dimname,
+// so we have a couple of workarounds.
+//
+// In the long term, we'll move Dimname to c10 and everything in this file
+// can be refactored out. The main blocker for that is that "c10::Symbol"
+// actually exists outside of c10 and needs to be moved in.
+// TensorImpl has a unique_ptr<NamedTensorMetaInterface> field.
+// XXX: Ideally we would just put std::optional<vector<Dimname>> into TensorImpl.
+//
+// This class has an important invariant: the list of dimnames must contain
+// at least ONE non-wildcard Dimname.
+struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface {
+  // Tag type taken by every constructor and setter. It exists to remind
+  // callers that the dimnames they pass must include at least one
+  // non-wildcard entry.
+  enum HAS_NON_WILDCARD {
+    HasNonWildcard
+  };
+  // Builds the metadata from a copy of `names`.
+  explicit NamedTensorMeta(HAS_NON_WILDCARD, DimnameList names)
+    : names_(names.vec()) {
+    check_invariants();
+  }
+  // Builds the metadata by taking over the `names` vector.
+  explicit NamedTensorMeta(HAS_NON_WILDCARD, std::vector<Dimname>&& names)
+    : names_(std::move(names)) {
+    check_invariants();
+  }
+  // Returns a new NamedTensorMeta holding a copy of the current names.
+  std::unique_ptr<c10::NamedTensorMetaInterface> clone() const override {
+    return std::make_unique<NamedTensorMeta>(HasNonWildcard, names_);
+  }
+  // View over the stored names.
+  DimnameList names() const {
+    return DimnameList(names_);
+  }
+  // Used for an assertion in TensorImpl.h
+  int64_t slow_dim() const override {
+    const auto name_count = names_.size();
+    return static_cast<int64_t>(name_count);
+  }
+  // Debug-only assertion that at least one stored name is not a wildcard.
+  void check_invariants() const {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      std::any_of(names_.begin(), names_.end(), [](const Dimname& name) { return !name.isWildcard(); }));
+  }
+  // Overwrites the stored names element by element; the number of names
+  // must stay the same.
+  void set_names(HAS_NON_WILDCARD, DimnameList new_names) {
+    TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
+    std::copy(new_names.begin(), new_names.end(), names_.begin());
+    check_invariants();
+  }
+  // Replaces the stored names by taking over `new_names`; the number of
+  // names must stay the same.
+  void set_names(HAS_NON_WILDCARD, std::vector<Dimname>&& new_names) {
+    TORCH_INTERNAL_ASSERT(new_names.size() == names_.size());
+    names_ = std::move(new_names);
+    check_invariants();
+  }
+  // INVARIANT: at least one Dimname is non-WILDCARD
+  std::vector<Dimname> names_;
+};
+// When NamesMode is disabled, then all operations ignore tensors' names fields.
+// Concretely speaking, all tensors are treated as having nullopt names.
+struct TORCH_API NamesMode {
+  static bool is_enabled(); // Reports the current setting; only declared here.
+  static void set_enabled(bool enabled); // Changes the setting; only declared here.
+};
+// A RAII, thread local (!) guard that enables or disables names upon
+// construction, and sets it back to the original value upon destruction.
+struct TORCH_API NoNamesGuard {
+  // Records the current NamesMode setting, then turns names off.
+  NoNamesGuard() : prev_mode(NamesMode::is_enabled()) {
+    NamesMode::set_enabled(false);
+  }
+  // Neither copyable nor movable: exactly one object is responsible for
+  // restoring the saved setting.
+  NoNamesGuard(const NoNamesGuard&) = delete;
+  NoNamesGuard(NoNamesGuard&&) = delete;
+  NoNamesGuard& operator=(const NoNamesGuard&) = delete;
+  NoNamesGuard& operator=(NoNamesGuard&&) = delete;
+  // Restores the saved setting unless reset() has already done so.
+  ~NoNamesGuard() {
+    if (initialized) {
+      reset();
+    }
+  }
+  // Restores the saved setting ahead of destruction. May be called at most
+  // once: the guard is marked as spent afterwards, so the destructor will not
+  // overwrite whatever NamesMode value is in effect by then, and a second
+  // call trips the assertion.
+  void reset() {
+    TORCH_INTERNAL_ASSERT(initialized);
+    NamesMode::set_enabled(prev_mode);
+    initialized = false;
+  }
+ private:
+  // NamesMode setting observed at construction.
+  bool prev_mode;
+  // True while the saved setting still has to be restored.
+  bool initialized{true};
+};
+void check_names_valid_for(const TensorBase& tensor, DimnameList names);
+void check_names_valid_for(size_t tensor_dim, DimnameList names); // Overload taking a dimension count instead of a tensor.
+// Sets the names of `tensor` to be `names`.
+TORCH_API const TensorBase& internal_set_names_inplace(const TensorBase& tensor, std::optional<DimnameList> names);
+TORCH_API const TensorBase& internal_set_names_inplace(const TensorBase& tensor, std::vector<Dimname>&& names, bool validate_names);
+constexpr size_t kMaxNamedTensorDim = 64; // NOTE(review): presumably the largest dimension count supported for named tensors; confirm.
+DimnameList default_names(size_t len);
+namespace impl {
+// Some helper functions on TensorImpl. Useful for working with names in TH.
+// XXX: Ideally these would exist as methods on TensorImpl
+TORCH_API void internal_set_names_inplace(TensorImpl* impl, std::optional<DimnameList> names, bool validate_names);
+TORCH_API void internal_set_names_inplace(TensorImpl* impl, std::vector<Dimname>&& names, bool validate_names);
+void check_names_valid_for(TensorImpl* impl, DimnameList names);
+// Returns true if the tensor's names exist and are not all 'None'.
+// Returns false if the tensor's names don't exist (were not allocated),
+// or if all names are 'None'.
+// We treat not-allocated-names the same as allocated names that are all 'None'.
+TORCH_API bool has_names(const TensorImpl* impl);
+// Returns the names of the tensor's dimensions.
+// Unnamed tensors are treated as having 'None' in all dimensions; this method
+// would return a DimnameList of all 'None's for an unnamed tensor.
+TORCH_API DimnameList get_names(const TensorImpl* impl);
+// This is more of an implementation detail; one should use impl::get_names /
+// Tensor::names() whenever possible because it provides a cleaner API.
+// Returns the names of the tensor if they have been allocated; returns nullopt
+// instead if they haven't been. The names of a tensor are not allocated if a
+// tensor is constructed with names=None.
+TORCH_API std::optional<DimnameList> get_opt_names(const TensorImpl* impl);
+} // namespace impl
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/NestedIntSymNodeImpl.h ADDED Viewed

	@@ -0,0 +1,187 @@

+#pragma once
+#include <c10/core/ConstantSymNodeImpl.h>
+#include <c10/core/SymNodeImpl.h>
+#include <c10/macros/Export.h>
+#include <c10/util/Exception.h>
+#include <c10/util/intrusive_ptr.h>
+#include <cstdint>
+#include <optional>
+#include <string>
+namespace c10 {
+// The motivating usecase for this is to represent the ragged size structure
+// of a jagged tensor [B, [s_0, s_1, s_2], D] as a single integer j0. This
+// allows us to simply return [B, j0, D] if someone queries for the size of our
+// tensor.
+//
+// Morally we define comparison between two nested ints to return true if
+// that comparison holds for all corresponding elements of the arrays they
+// represent. Comparison between a nested int and a plain int is defined
+// similarly.
+//
+// To simulate this desired behavior but also avoid the O(N) cost of checking,
+// we associate each raggedness pattern with an integer "id" that can be used as
+// a proxy to evaluate equality. We also constrain the range of values this id
+// can stand for, so as to enable inequality checks.
+//
+// We also support a positive integer scalar "coeff" that is used for computing
+// strides. For example, given a [B, j0, D] tensor, it can be strided in two
+// different ways: [D * j0, D, 1] and [j0, 1, sum(j0)]. The coeff is used to
+// differentiate the two cases.
+//
+// During tracing the strides of the outputs need to be a function of the size
+// and strides of the inputs so it is important that NestedIntSymNode itself is
+// able to express this.
+class TORCH_API NestedIntSymNodeImpl : public SymNodeImpl {
+ public:
+  // CAUTION: you should probably not be constructing these directly; please
+  // use the higher-level API in python instead (TODO: actually introduce that).
+  //
+  // `val` is the integer printed after "j" by str(); `coeff` is the
+  // multiplier printed in front of it (assumed positive, see below).
+  explicit NestedIntSymNodeImpl(int64_t val, int64_t coeff)
+      : val_(val), coeff_(coeff) {}
+  bool bool_() override {
+    return false;
+  }
+  bool is_int() override {
+    return true;
+  }
+  bool is_float() override {
+    return false;
+  }
+  bool is_bool() override {
+    return false;
+  }
+  bool is_nested_int() const override {
+    return true;
+  }
+  bool has_hint() override {
+    return true;
+  }
+  // Wraps a plain integer in a constant (non-nested) node.
+  c10::SymNode wrap_int(int64_t num) override {
+    return SymNode(c10::make_intrusive<ConstantSymNodeImpl<int64_t>>(num));
+  }
+  // The guard_* and int_ conversions always fail for a nested int; each one
+  // reports which conversion was attempted.
+  int64_t guard_int(const char* file, int64_t line) override {
+    TORCH_CHECK(false, "guard_int is not supported by NestedIntSymNode");
+  }
+  double guard_float(const char* file, int64_t line) override {
+    TORCH_CHECK(false, "not a float");
+  }
+  bool guard_bool(const char* file, int64_t line) override {
+    TORCH_CHECK(false, "not a bool");
+  }
+  int64_t int_() override {
+    TORCH_CHECK(false, "int_ is not supported by NestedIntSymNode");
+  }
+  // Renders as "j<val>", prefixed with "<coeff>*" when the coefficient is
+  // not 1.
+  std::string str() override {
+    if (coeff_ == 1) {
+      return "j" + std::to_string(val_);
+    }
+    return std::to_string(coeff_) + "*j" + std::to_string(val_);
+  }
+  // NOTE [ Inequalities with nested int ]
+  //
+  // The semantics of nested int when it comes to relations is that it is
+  // treated as integer known to be within a certain range,
+  //
+  //     j0 \in [2, int64_t::max]
+  //
+  // allowing us to answer queries like j0 >= 1 (True), and j0 == 0 (False).
+  // This is a useful default range for the raggedness pattern of a jagged
+  // tensor (1) since sizes are non-negative, and (2) we need to get past 0/1
+  // specialization checks.
+  //
+  // [ Indeterminate inequalities error out ]
+  //
+  // Given the semantic defined above, certain relations like j0 < 3 are thus
+  // indeterminable. In our impl today, evaluating such relations errors out.
+  //
+  // It may seem convenient to just define indeterminate relations to return
+  // False, but the implementation we maintain in parallel using sympy does not
+  // allow this.
+  //
+  // Sympy only allows overriding of Ge. The other relations (Lt, Gt, Le) are,
+  // by consequence, all derived from Ge e.g., Lt(a, b) := !Ge(a, b). This
+  // would mean that if we define the indeterminate j0 >= 3 to be
+  // False, the also indeterminate j0 < 3 will be evaluated to be True!
+  //
+  // [ Coefficient are assumed positive ]
+  //
+  // For the purpose of computing inequalities, we consider the coefficient of
+  // the nested int to be a positive integer.
+  //
+  // Thus, no modifications are needed to the logic since
+  // j0 >= k implies coeff * j0 >= k
+  //
+  c10::SymNode eq(const c10::SymNode& other) override;
+  c10::SymNode ne(const c10::SymNode& other) override;
+  c10::SymNode ge(const c10::SymNode& other) override;
+  c10::SymNode gt(const c10::SymNode& other) override;
+  c10::SymNode lt(const c10::SymNode& other) override;
+  c10::SymNode le(const c10::SymNode& other) override;
+  c10::SymNode mul(const c10::SymNode& other) override;
+  std::optional<int64_t> nested_int() override {
+    return val_;
+  }
+  std::optional<int64_t> nested_int_coeff() override {
+    return coeff_;
+  }
+  bool is_symbolic() override {
+    return false;
+  }
+  c10::SymNode clone() override;
+  // Every remaining binary operation fails unconditionally with a message
+  // naming the operation.
+#define DEFINE_BINARY_NOT_SUPPORTED(name)                           \
+  c10::SymNode name(const c10::SymNode& other) override {           \
+    TORCH_CHECK(false, #name " not supported by NestedIntSymNode"); \
+  }
+  DEFINE_BINARY_NOT_SUPPORTED(add)
+  DEFINE_BINARY_NOT_SUPPORTED(sub)
+  DEFINE_BINARY_NOT_SUPPORTED(truediv)
+  DEFINE_BINARY_NOT_SUPPORTED(pow)
+  DEFINE_BINARY_NOT_SUPPORTED(floordiv)
+  DEFINE_BINARY_NOT_SUPPORTED(mod)
+  DEFINE_BINARY_NOT_SUPPORTED(sym_min)
+  DEFINE_BINARY_NOT_SUPPORTED(sym_max)
+  DEFINE_BINARY_NOT_SUPPORTED(sym_and)
+  DEFINE_BINARY_NOT_SUPPORTED(sym_or)
+#undef DEFINE_BINARY_NOT_SUPPORTED
+  // Likewise for the unary operations listed below.
+#define DEFINE_NOT_SUPPORTED(name)                                     \
+  c10::SymNode name() override {                                       \
+    TORCH_CHECK(false, #name " is not supported by NestedIntSymNode"); \
+  }
+  DEFINE_NOT_SUPPORTED(sym_not)
+  DEFINE_NOT_SUPPORTED(ceil)
+  DEFINE_NOT_SUPPORTED(floor)
+  DEFINE_NOT_SUPPORTED(neg)
+  DEFINE_NOT_SUPPORTED(sym_float)
+#undef DEFINE_NOT_SUPPORTED
+ private:
+  int64_t val_;
+  int64_t coeff_;
+};
+} // namespace c10

.venv/lib/python3.12/site-packages/torch/include/ATen/core/PhiloxRNGEngine.h ADDED Viewed

	@@ -0,0 +1,240 @@

+#pragma once
+// define constants like M_PI and C keywords for MSVC
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#include <math.h>
+#endif
+#ifdef __CUDACC__
+#include <cuda.h>
+#endif
+#include <array>
+#include <c10/macros/Macros.h>
+#include <cmath>
+#include <cstdint>
+namespace at {
+// typedefs for holding vector data
+namespace detail {
+// Fixed-size groups of words used for the engine's counter, output, and key,
+// plus two-element floating-point pairs.
+using UINT4 = std::array<uint32_t, 4>;
+using UINT2 = std::array<uint32_t, 2>;
+using DOUBLE2 = std::array<double, 2>;
+using FLOAT2 = std::array<float, 2>;
+} // namespace detail
+/**
+ * Note [Philox Engine implementation]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Originally implemented in PyTorch's fusion compiler
+ * Refer to: http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
+ * for details regarding the engine.
+ *
+ * Note that currently this implementation of the philox engine is not used
+ * anywhere except for tests in cpu_generator_test.cpp. However, this engine
+ * will replace curandStatePhilox4_32_10_t in the future.
+ *
+ * The philox engine takes a seed value, a subsequence
+ * for starting the generation and an offset for the subsequence.
+ * Think of this engine as an algorithm producing a huge array. We are
+ * parallelizing this array by partitioning the huge array and assigning
+ * a thread index to each partition. In other words, each seed value
+ * (there are 2^64 possible seed values) gives a sub array of size
+ * 2^128 (each element in that array is a 128 bit number). Reasoning
+ * behind the array being of size 2^128 is, there are 2^64 possible
+ * thread index value and there is an array of size 2^64 for each of
+ * those thread index. Hence 2^64 * 2^64 = 2^128 for each seed value.
+ *
+ * In short, this generator can produce 2^64 (seed values) * 2^128 (number
+ * of elements in an array given by a seed value) = 2^192 values.
+ *
+ * Arguments:
+ * seed:        Seed values could be any number from 0 to 2^64-1.
+ * subsequence: Subsequence is just the cuda thread indexing with:
+ *              - blockIdx.x * blockDim.x + threadIdx.x
+ * offset:      The offset variable in PhiloxEngine  decides how many 128-bit
+ *              random numbers to skip (i.e. how many groups of 4, 32-bit numbers to skip)
+ *              and hence really decides the total number of randoms that can be achieved
+ *              for the given subsequence.
+ */
+class philox_engine {
+public:
+  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
+  C10_HOST_DEVICE inline explicit philox_engine(uint64_t seed = 67280421310721,
+                                 uint64_t subsequence = 0,
+                                 uint64_t offset = 0) {
+    reset_state(seed, subsequence);
+    incr_n(offset); // skip `offset` 128-bit blocks from the start of the subsequence
+  }
+  C10_HOST_DEVICE inline void reset_state(uint64_t seed = 67280421310721,
+                                 uint64_t subsequence = 0) {
+    key_[0] = static_cast<uint32_t>(seed); // low 32 bits of the seed
+    key_[1] = static_cast<uint32_t>(seed >> 32); // high 32 bits of the seed
+    counter_ = detail::UINT4{}; // zero all four counter words
+    counter_[2] = static_cast<uint32_t>(subsequence);
+    counter_[3] = static_cast<uint32_t>(subsequence >> 32);
+    STATE = 0; // forces the next operator() call to generate a fresh block
+  }
+  /**
+   * Set the offset field of Philox Generator to the desired offset.
+   * Only counter_[0] and counter_[1] are written; STATE and the cached
+   * output are left as they are.
+   */
+  C10_HOST_DEVICE inline void set_offset(uint64_t offset) {
+    counter_[0] = static_cast<uint32_t>(offset);
+    counter_[1] = static_cast<uint32_t>(offset >> 32);
+  }
+  /**
+   * Gets the current offset of the Philox Generator.
+   * Reassembled from counter_[0] (low word) and counter_[1] (high word).
+   */
+  C10_HOST_DEVICE uint64_t get_offset() const {
+    uint64_t lo = static_cast<uint64_t>(counter_[0]);
+    uint64_t hi = static_cast<uint64_t>(counter_[1]) << 32;
+    return lo | hi;
+  }
+  /**
+   * Produces a unique 32-bit pseudo random number on every invocation. Bookkeeps state to avoid waste.
+   * A block of four words is generated (and the counter advanced by one)
+   * only when STATE is 0; STATE then cycles through 0..3 so that each of
+   * the four words is handed out before the next block is computed.
+   */
+  C10_HOST_DEVICE inline uint32_t operator()(int32_t n_rounds = 10) { // 10 here to preserve back-compat behavior
+    if(STATE == 0) {
+      detail::UINT4 counter = counter_;
+      detail::UINT2 key = key_;
+      output_ = rand(counter, key, n_rounds);
+      incr();
+    }
+    uint32_t ret = output_[static_cast<int>(STATE)];
+    STATE = (STATE + 1) & 3;
+    return ret;
+  }
+  inline float randn(uint32_t n_rounds) { // NOTE(review): always reads output_[0..1] and never advances STATE; confirm this is intended.
+    #ifdef __CUDA_ARCH__
+    AT_ASSERT(false, "Unsupported invocation of randn on CUDA");
+    #endif
+    if(STATE == 0) {
+      detail::UINT4 counter = counter_;
+      detail::UINT2 key = key_;
+      output_ = rand(counter, key, n_rounds);
+      incr();
+    }
+    // TODO(min-jean-cho) change to Polar method, a more efficient version of Box-Muller method
+    // TODO(voz) We use std:: below, and thus need a separate impl for CUDA.
+    float u1 = 1 - uint32_to_uniform_float(output_[0]); // uint32_to_uniform_float returns [0,1), we need (0,1] to avoid passing 0 to log.
+    float u2 = 1 - uint32_to_uniform_float(output_[1]);
+    return static_cast<float>(std::sqrt(-2.0 * std::log(u1)) * std::cos(2.0 * M_PI * u2));
+  }
+  /**
+   * Function that Skips N 128 bit numbers in a subsequence
+   * The 64-bit n is added to counter_[0] (low) and counter_[1] (high); a
+   * carry out of counter_[1] is propagated into counter_[2] and then
+   * counter_[3].
+   */
+  C10_HOST_DEVICE inline void incr_n(uint64_t n) {
+    uint32_t nlo = static_cast<uint32_t>(n);
+    uint32_t nhi = static_cast<uint32_t>(n >> 32);
+    counter_[0] += nlo;
+    // if overflow in x has occurred, carry over to nhi
+    if (counter_[0] < nlo) {
+      nhi++;
+      // if overflow in nhi has occurred during carry over,
+      // propagate that overflow to y and exit to increment z
+      // otherwise return
+      counter_[1] += nhi;
+      if(nhi != 0) {
+        if (nhi <= counter_[1]) {
+          return;
+        }
+      }
+    } else {
+      // if overflow in y has occurred during addition,
+      // exit to increment z
+      // otherwise return
+      counter_[1] += nhi;
+      if (nhi <= counter_[1]) {
+        return;
+      }
+    }
+    if (++counter_[2])
+      return;
+    ++counter_[3];
+  }
+  /**
+   * Function that Skips one 128 bit number in a subsequence
+   * Increments the four counter words as one 128-bit value: a word that
+   * wraps to 0 carries into the next one.
+   */
+  C10_HOST_DEVICE inline void incr() {
+    if (++counter_[0])
+      return;
+    if (++counter_[1])
+      return;
+    if (++counter_[2]) {
+      return;
+    }
+    ++counter_[3];
+  }
+private:
+  detail::UINT4 counter_; // words [0..1] hold the offset, words [2..3] the subsequence
+  detail::UINT4 output_; // most recently generated block of four 32-bit words
+  detail::UINT2 key_; // seed split into low / high 32-bit words
+  uint32_t STATE; // index (0..3) of the next word of output_ returned by operator()
+  C10_HOST_DEVICE inline uint32_t mulhilo32(uint32_t a, uint32_t b,
+                                    uint32_t *result_high) { // returns the low 32 bits of a*b; the high 32 bits go to *result_high
+    #ifdef __CUDA_ARCH__
+      *result_high = __umulhi(a, b);
+      return a*b;
+    #else
+      const uint64_t product = static_cast<uint64_t>(a) * b;
+      *result_high = static_cast<uint32_t>(product >> 32);
+      return static_cast<uint32_t>(product);
+    #endif
+  }
+  C10_HOST_DEVICE inline detail::UINT4 single_round(detail::UINT4 ctr, detail::UINT2 in_key) { // one mixing round over the counter words using the two key words
+    uint32_t hi0 = 0;
+    uint32_t hi1 = 0;
+    uint32_t lo0 = mulhilo32(kPhiloxSA, ctr[0], &hi0);
+    uint32_t lo1 = mulhilo32(kPhiloxSB, ctr[2], &hi1);
+    detail::UINT4 ret;
+    ret[0] = hi1 ^ ctr[1] ^ in_key[0];
+    ret[1] = lo1;
+    ret[2] = hi0 ^ ctr[3] ^ in_key[1];
+    ret[3] = lo0;
+    return ret;
+  }
+  C10_HOST_DEVICE constexpr float uint32_to_uniform_float(uint32_t value) { // drops the top bit of `value`, then scales it into a float
+      // maximum value such that `MAX_INT * scale < 1.0` (with float rounding)
+      constexpr float scale = 4.6566127342e-10;
+      return static_cast<float>(value & 0x7FFFFFFF) * scale;
+  }
+  C10_HOST_DEVICE inline detail::UINT4 rand(detail::UINT4& counter, detail::UINT2& key, uint32_t n_rounds) { // modifies `counter` and `key` in place
+    for (uint32_t round = 0; round < (n_rounds - 1); round++) { // NOTE(review): n_rounds == 0 wraps (n_rounds - 1) around; callers presumably pass >= 1.
+        counter = single_round(counter, key);
+        key[0] += (kPhilox10A); key[1] += (kPhilox10B);
+      }
+    return single_round(counter, key);
+  }
+  static const uint32_t kPhilox10A = 0x9E3779B9; // added to key[0] after each non-final round
+  static const uint32_t kPhilox10B = 0xBB67AE85; // added to key[1] after each non-final round
+  static const uint32_t kPhiloxSA = 0xD2511F53; // multiplier applied to ctr[0] in single_round
+  static const uint32_t kPhiloxSB = 0xCD9E8D57; // multiplier applied to ctr[2] in single_round
+};
+typedef philox_engine Philox4_32;
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/PythonFallbackKernel.h ADDED Viewed

	@@ -0,0 +1,35 @@

+#pragma once
+#include <ATen/core/TorchDispatchUtils.h>
+namespace at::impl {
+struct TORCH_API RestorePythonTLSSnapshot {
+  RestorePythonTLSSnapshot(); // defined out of line
+  RestorePythonTLSSnapshot(RestorePythonTLSSnapshot&& other) = delete; // not movable
+  RestorePythonTLSSnapshot(const RestorePythonTLSSnapshot&) = delete; // not copyable
+  RestorePythonTLSSnapshot& operator=(const RestorePythonTLSSnapshot&) = delete;
+  RestorePythonTLSSnapshot& operator=(RestorePythonTLSSnapshot&&) = delete;
+  ~RestorePythonTLSSnapshot(); // defined out of line
+private:
+  c10::impl::LocalDispatchKeySet saved_; // NOTE(review): presumably the dispatch-key TLS captured on entry; confirm against the .cpp
+  c10::impl::ForceDispatchKeyGuard guard_;
+};
+// RAII guard to make working with the above TLS safer.
+struct TORCH_API MaybeSetTLSOnEntryGuard {
+public:
+  MaybeSetTLSOnEntryGuard(); // defined out of line
+  MaybeSetTLSOnEntryGuard(MaybeSetTLSOnEntryGuard&& other) = delete; // not movable
+  MaybeSetTLSOnEntryGuard(const MaybeSetTLSOnEntryGuard&) = delete; // not copyable
+  MaybeSetTLSOnEntryGuard& operator=(const MaybeSetTLSOnEntryGuard&) = delete;
+  MaybeSetTLSOnEntryGuard& operator=(MaybeSetTLSOnEntryGuard&&) = delete;
+  ~MaybeSetTLSOnEntryGuard(); // defined out of line
+private:
+  bool value_set_; // NOTE(review): presumably records whether the constructor set the TLS, so the destructor knows to undo it; confirm
+};
+} // namespace at::impl

.venv/lib/python3.12/site-packages/torch/include/ATen/core/PythonOpRegistrationTrampoline.h ADDED Viewed

	@@ -0,0 +1,22 @@

+#pragma once
+#include <ATen/core/dispatch/Dispatcher.h>
+// TODO: this can probably live in c10
+namespace at::impl {
+class TORCH_API PythonOpRegistrationTrampoline final {
+  static std::atomic<c10::impl::PyInterpreter*> interpreter_; // private atomic interpreter slot; only declared here
+public:
+  //  Returns true if you successfully registered yourself (that means
+  //  you are in the hot seat for doing the operator registrations!)
+  static bool registerInterpreter(c10::impl::PyInterpreter*);
+  // Returns nullptr if no interpreter has been registered yet.
+  static c10::impl::PyInterpreter* getInterpreter();
+};
+} // namespace at::impl

.venv/lib/python3.12/site-packages/torch/include/ATen/core/QuantizerBase.h ADDED Viewed

	@@ -0,0 +1,84 @@

+#pragma once
+#include <c10/core/ScalarType.h>
+#include <c10/core/QScheme.h>
+#include <c10/util/intrusive_ptr.h>
+namespace at {
+class Tensor;
+struct QTensorImpl;
+struct Quantizer;
+using ConstQuantizerPtr = const c10::intrusive_ptr<Quantizer>&;
+using QuantizerPtr = c10::intrusive_ptr<Quantizer>;
+/**
+ * Quantizer is the class for storing all the information
+ * that's necessary to perform quantize and dequantize
+ * operation.
+ *
+ * We might have different types of quantization schemes and this is
+ * the base class for all quantizers.
+ *
+ * QTensorImpl will hold a pointer to Quantizer so that we can support
+ * different quantization schemes on Tensor.
+ *
+ * For example, the most common quantization scheme, Affine Quantization,
+ * requires scale and zero_point as parameters, we'll store scale and zero_point
+ * inside the instance and we can use it to quantize a float Tensor or
+ * dequantize a quantized Tensor.
+ *
+ * When you add new types of leaf Quantizer class, please also
+ * make sure to add a corresponding QScheme enum since
+ * they should have one to one mapping.
+ *
+ * Note about intrusive_ptr:
+ * Quantized Tensor holds an intrusive_ptr to Quantizer, and multiple Tensor can
+ * share the same Quantizer. Quantizer should be immutable.
+ */
+struct TORCH_API Quantizer : public c10::intrusive_ptr_target {
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
+  const ScalarType scalar_type_; // fixed at construction; exposed through scalar_type()
+  explicit Quantizer(ScalarType scalar_type) : scalar_type_(scalar_type) {}
+  ~Quantizer() override = default;
+  // Copied from torch/csrc/jit/ir/scope.h
+  QuantizerPtr intrusive_from_this() {
+    c10::raw::intrusive_ptr::incref(this); // we are creating a new pointer
+                                           // from a raw `this` pointer
+                                           // so we need to bump the refcount
+                                           // to account for this ownership
+    return c10::intrusive_ptr<Quantizer>::reclaim(this);
+  }
+  /**
+   * Each concrete Quantizer type should have a unique QScheme type.
+   * Pure virtual: every concrete quantizer has to provide it.
+   */
+  virtual QScheme qscheme() const = 0;
+  ScalarType scalar_type() const {
+    return scalar_type_;
+  }
+  /**
+   * quantize a float Tensor into a quantized Tensor.
+   * Pure virtual.
+   */
+  virtual Tensor quantize(const Tensor& t) = 0;
+  /**
+   * dequantize a quantized Tensor into a float Tensor.
+   * Pure virtual.
+   */
+  virtual Tensor dequantize(const Tensor& t) = 0;
+  /**
+   * dequantize a quantized Tensor into a float Tensor, out= variant
+   * Takes the destination `out` by reference and returns a Tensor&.
+   */
+  virtual Tensor& dequantize_out(Tensor& out, const Tensor& t) = 0;
+  /**
+   * Compare against `other` for equality.
+   * `other` is passed by value as a QuantizerPtr.
+   */
+  virtual bool equalTo(QuantizerPtr other) const = 0;
+};
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Range.h ADDED Viewed

	@@ -0,0 +1,25 @@

+#pragma once
+#include <cstdint>
+#include <iosfwd>
+namespace at {
+// Pair of int64_t endpoints; size() is `end - begin`.
+struct Range {
+  Range(int64_t begin, int64_t end)
+    : begin(begin)
+    , end(end) {}
+  // Distance between the endpoints (negative when end < begin).
+  int64_t size() const { return end - begin; }
+  // Returns a new Range whose endpoints are each divided by `divisor` using
+  // integer division. Const-qualified so it can be applied to const ranges;
+  // `divisor` must be non-zero.
+  Range operator/(int64_t divisor) const {
+    return Range(begin / divisor, end / divisor);
+  }
+  int64_t begin;
+  int64_t end;
+};
+std::ostream& operator<<(std::ostream& out, const Range& range);
+}  // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Reduction.h ADDED Viewed

	@@ -0,0 +1,14 @@

+#pragma once
+namespace at::Reduction {
+// NB: Keep this in sync with Reduction class in torch/nn/_reduction.py
+// These constants control the reduction behavior of loss functions.
+// Ideally, this would be a scoped enum, but jit doesn't support that
+enum Reduction {
+  None, // Do not reduce
+  Mean, // (Possibly weighted) mean of losses
+  Sum, // Sum losses
+  END // NOTE(review): presumably a past-the-end marker rather than a real reduction mode; confirm
+};
+} // namespace at::Reduction

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Scalar.h ADDED Viewed

	@@ -0,0 +1 @@


1	+ #include <c10/core/Scalar.h>

.venv/lib/python3.12/site-packages/torch/include/ATen/core/ScalarType.h ADDED Viewed

	@@ -0,0 +1 @@


1	+ #include <c10/core/ScalarType.h>

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Tensor.h ADDED Viewed

	@@ -0,0 +1,98 @@

+#pragma once
+#include <ATen/core/TensorBody.h>
+#include <c10/util/Exception.h>
+namespace at {
+// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
+class TORCH_API OptionalTensorRef {
+ public:
+  OptionalTensorRef() = default; // empty: ref_ is a default-constructed Tensor
+  ~OptionalTensorRef() {
+    ref_.unsafeReleaseTensorImpl(); // NOTE(review): presumably lets go of the borrowed impl without a refcount decrement; confirm
+  }
+  OptionalTensorRef(const TensorBase& src)
+      : ref_(Tensor::unsafe_borrow_t{}, src) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src.defined()); // debug builds reject an undefined source
+  }
+  OptionalTensorRef(const OptionalTensorRef& rhs)
+      : ref_(Tensor::unsafe_borrow_t{}, rhs.ref_) {} // copying borrows again instead of using Tensor's copy constructor
+  OptionalTensorRef(OptionalTensorRef&& rhs) = default;
+  OptionalTensorRef& operator=(OptionalTensorRef rhs) { // by-value parameter serves both copy and move assignment
+    std::swap(ref_, rhs.ref_);
+    return *this;
+  }
+  bool has_value() const {
+    return ref_.defined();
+  }
+  const Tensor& getTensorRef() const & { // lvalue-qualified: cannot be called on a temporary
+    return ref_;
+  }
+  const Tensor& operator*() const & {
+    return ref_;
+  }
+  const Tensor* operator->() const & {
+    return &ref_;
+  }
+  operator bool() const { // same result as has_value()
+    return ref_.defined();
+  }
+ private:
+  Tensor ref_;
+};
+// Use to convert a TensorBase (that may be undefined) to an at::Tensor
+// without bumping refcount.
+class TORCH_API TensorRef {
+ public:
+  // ref_ is constructed with unsafe_borrow_t, so it is let go of through
+  // unsafeReleaseTensorImpl() before Tensor's own destructor runs.
+  ~TensorRef() {
+    ref_.unsafeReleaseTensorImpl();
+  }
+  TensorRef(const TensorBase& src)
+      : ref_(Tensor::unsafe_borrow_t{}, src) {}
+  TensorRef(TensorRef&& other) = default;
+  // Copying borrows again, exactly as OptionalTensorRef does. A defaulted
+  // copy would go through Tensor's ordinary copy constructor even though the
+  // destructor above releases the handle instead of destroying it normally.
+  TensorRef(const TensorRef& other)
+      : ref_(Tensor::unsafe_borrow_t{}, other.ref_) {}
+  // Assignment swaps handles rather than using Tensor's assignment operators,
+  // so the previously borrowed handle is released by a TensorRef destructor
+  // (the temporary's, or `other`'s) instead of being destroyed as if owned.
+  TensorRef& operator=(const TensorRef& other) {
+    TensorRef tmp(other);
+    std::swap(ref_, tmp.ref_);
+    return *this;
+  }
+  TensorRef& operator=(TensorRef&& other) noexcept {
+    std::swap(ref_, other.ref_);
+    return *this;
+  }
+  // Lvalue-qualified so the reference cannot be taken from a temporary.
+  const Tensor& operator*() const & {
+    return ref_;
+  }
+ private:
+  Tensor ref_;
+};
+template <typename T>
+auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_void_t<T> {
+  // A hook with a void return type is wrapped so that the registered callable
+  // still yields a Tensor: the wrapper runs the hook and hands back a
+  // default-constructed Tensor.
+  static_assert(std::is_same_v<decltype(hook(Tensor())), void>,
+                "Expected hook to return void");
+  auto adapter = [fn=std::forward<T>(hook)](const TensorBase& grad_base) {
+    TensorRef borrowed_grad(grad_base);
+    fn(*borrowed_grad);
+    return Tensor();
+  };
+  return _register_hook(std::move(adapter));
+}
+template <typename T>
+auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_var_t<T> {
+  // Wrap the value-returning hook: view the incoming TensorBase as a Tensor
+  // through TensorRef, call the hook, and pass its result on as a TensorBase.
+  auto adapter = [fn=std::forward<T>(hook)](const TensorBase& grad_base) {
+    TensorRef borrowed_grad(grad_base);
+    Tensor hook_result = fn(*borrowed_grad);
+    return TensorBase(std::move(hook_result));
+  };
+  return _register_hook(std::move(adapter));
+}
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/TensorAccessor.h ADDED Viewed

	@@ -0,0 +1,275 @@

+#pragma once
+#include <c10/macros/Macros.h>
+#include <c10/util/ArrayRef.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Exception.h>
+#include <c10/util/irange.h>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+namespace at {
+// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor
+// is used to enable the __restrict__ keyword/modifier for the data
+// passed to cuda.
+// Default pointer traits: the accessor's data pointer is a plain `T*`.
+template <typename T>
+struct DefaultPtrTraits {
+  using PtrType = T*;
+};
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template <typename T>
+struct RestrictPtrTraits {
+  typedef T* __restrict__ PtrType; // same pointer type as DefaultPtrTraits, with the __restrict__ qualifier added
+};
+#endif
+// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors.
+// For CUDA tensors it is used in device code (only). This means that we restrict ourselves
+// to functions and types available there (e.g. IntArrayRef isn't).
+// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers.
+template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+class TensorAccessorBase {
+public:
+  using PtrType = typename PtrTraits<T>::PtrType;
+  // Stores the three pointers as given; nothing is copied, so the size and
+  // stride arrays must stay alive for as long as the accessor is used.
+  C10_HOST_DEVICE TensorAccessorBase(
+      PtrType data_ptr,
+      const index_t* sizes_ptr,
+      const index_t* strides_ptr)
+      : data_(data_ptr), sizes_(sizes_ptr), strides_(strides_ptr) {}
+  // Host-only views over the N sizes / strides.
+  C10_HOST IntArrayRef sizes() const {
+    return IntArrayRef(sizes_, N);
+  }
+  C10_HOST IntArrayRef strides() const {
+    return IntArrayRef(strides_, N);
+  }
+  // Unchecked lookup of a single stride / size.
+  C10_HOST_DEVICE index_t stride(index_t i) const {
+    return strides_[i];
+  }
+  C10_HOST_DEVICE index_t size(index_t i) const {
+    return sizes_[i];
+  }
+  // Raw data pointer.
+  C10_HOST_DEVICE PtrType data() {
+    return data_;
+  }
+  C10_HOST_DEVICE const PtrType data() const {
+    return data_;
+  }
+protected:
+  PtrType data_;
+  const index_t* sizes_;
+  const index_t* strides_;
+};
+// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
+// `Tensor.accessor<T, N>()`.
+// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only
+// indexing on the device uses `TensorAccessor`s.
+template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+class TensorAccessor : public TensorAccessorBase<T,N,PtrTraits,index_t> {
+public:
+  using PtrType = typename PtrTraits<T>::PtrType;
+  // Forwards the three pointers to the base class unchanged.
+  C10_HOST_DEVICE TensorAccessor(
+      PtrType data_ptr,
+      const index_t* sizes_ptr,
+      const index_t* strides_ptr)
+      : TensorAccessorBase<T, N, PtrTraits, index_t>(data_ptr, sizes_ptr, strides_ptr) {}
+  // Indexing the leading dimension produces an accessor of rank N-1: the data
+  // pointer advances by i * strides_[0], and the size/stride pointers each
+  // move past their first entry. No bounds check is performed.
+  C10_HOST_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
+    using SubAccessor = TensorAccessor<T, N - 1, PtrTraits, index_t>;
+    return SubAccessor(this->data_ + this->strides_[0] * i, this->sizes_ + 1, this->strides_ + 1);
+  }
+  C10_HOST_DEVICE const TensorAccessor<T, N-1, PtrTraits, index_t> operator[](index_t i) const {
+    using SubAccessor = TensorAccessor<T, N - 1, PtrTraits, index_t>;
+    return SubAccessor(this->data_ + this->strides_[0] * i, this->sizes_ + 1, this->strides_ + 1);
+  }
+};
+// Rank-1 specialization: indexing yields an element reference rather than a
+// lower-rank accessor.
+template<typename T, template <typename U> class PtrTraits, typename index_t>
+class TensorAccessor<T,1,PtrTraits,index_t> : public TensorAccessorBase<T,1,PtrTraits,index_t> {
+public:
+  using PtrType = typename PtrTraits<T>::PtrType;
+  // Forwards the three pointers to the base class unchanged.
+  C10_HOST_DEVICE TensorAccessor(
+      PtrType data_ptr,
+      const index_t* sizes_ptr,
+      const index_t* strides_ptr)
+      : TensorAccessorBase<T, 1, PtrTraits, index_t>(data_ptr, sizes_ptr, strides_ptr) {}
+  // Element i lives at offset i * strides_[0]; no bounds check is performed.
+  C10_HOST_DEVICE T & operator[](index_t i) {
+    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+    return this->data_[this->strides_[0] * i];
+  }
+  C10_HOST_DEVICE const T & operator[](index_t i) const {
+    return this->data_[this->strides_[0] * i];
+  }
+};
+// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used for CUDA `Tensor`s on the host.
+// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host)
+// in order to transfer them to the device when calling kernels.
+// On the device, indexing of multidimensional tensors yields `TensorAccessor`s.
+// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__.
+// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available
+// on the device, so those functions are host only.
+template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+class GenericPackedTensorAccessorBase {
+public:
+  using PtrType = typename PtrTraits<T>::PtrType;
+  // Copies N sizes and N strides out of the given arrays into the accessor's
+  // own storage (host only).
+  C10_HOST GenericPackedTensorAccessorBase(
+      PtrType data_ptr,
+      const index_t* src_sizes,
+      const index_t* src_strides)
+      : data_(data_ptr) {
+    std::copy(src_sizes, src_sizes + N, std::begin(this->sizes_));
+    std::copy(src_strides, src_strides + N, std::begin(this->strides_));
+  }
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
+  C10_HOST GenericPackedTensorAccessorBase(
+      PtrType data_ptr,
+      const source_index_t* src_sizes,
+      const source_index_t* src_strides)
+      : data_(data_ptr) {
+    // Element-wise assignment converts each int64_t entry to index_t.
+    for (size_t dim = 0; dim < N; ++dim) {
+      this->sizes_[dim] = src_sizes[dim];
+      this->strides_[dim] = src_strides[dim];
+    }
+  }
+  // Unchecked lookup of a single stride / size.
+  C10_HOST_DEVICE index_t stride(index_t i) const {
+    return strides_[i];
+  }
+  C10_HOST_DEVICE index_t size(index_t i) const {
+    return sizes_[i];
+  }
+  // Raw data pointer.
+  C10_HOST_DEVICE PtrType data() {
+    return data_;
+  }
+  C10_HOST_DEVICE const PtrType data() const {
+    return data_;
+  }
+protected:
+  PtrType data_;
+  // NOLINTNEXTLINE(*c-arrays*)
+  index_t sizes_[N];
+  // NOLINTNEXTLINE(*c-arrays*)
+  index_t strides_[N];
+  // Host-only check that `i` names one of the N dimensions.
+  C10_HOST void bounds_check_(index_t i) const {
+    TORCH_CHECK_INDEX(
+        0 <= i && i < index_t{N},
+        "Index ",
+        i,
+        " is not within bounds of a tensor of dimension ",
+        N);
+  }
+};
+// N-dimensional packed accessor. Construction and transpose() are host-only;
+// operator[] is device-only and produces a TensorAccessor over the remaining
+// N - 1 dimensions.
+template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase<T,N,PtrTraits,index_t> {
+public:
+  using PtrType = typename PtrTraits<T>::PtrType;
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_, const index_t* sizes_, const index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
+  // Selects index `i` along dimension 0: the data pointer advances by
+  // `strides_[0] * i` and the size/stride arrays are viewed from entry 1 on.
+  C10_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
+    return TensorAccessor<T, N - 1, PtrTraits, index_t>(
+        this->data_ + this->strides_[0] * i,
+        this->sizes_ + 1,
+        this->strides_ + 1);
+  }
+  C10_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) const {
+    return TensorAccessor<T, N - 1, PtrTraits, index_t>(
+        this->data_ + this->strides_[0] * i,
+        this->sizes_ + 1,
+        this->strides_ + 1);
+  }
+  /// Returns a PackedTensorAccessor of the same dimension with the two given
+  /// dimensions exchanged. No elements are moved: only the size and stride
+  /// entries of the copy are swapped. Both dimensions are validated with
+  /// bounds_check_ before the copy is made.
+  C10_HOST GenericPackedTensorAccessor<T, N, PtrTraits, index_t> transpose(
+      index_t dim1,
+      index_t dim2) const {
+    this->bounds_check_(dim1);
+    this->bounds_check_(dim2);
+    GenericPackedTensorAccessor transposed(
+        this->data_, this->sizes_, this->strides_);
+    std::swap(transposed.sizes_[dim1], transposed.sizes_[dim2]);
+    std::swap(transposed.strides_[dim1], transposed.strides_[dim2]);
+    return transposed;
+  }
+};
+// One-dimensional packed accessor: operator[] (device-only) yields element
+// references directly.
+template<typename T, template <typename U> class PtrTraits, typename index_t>
+class GenericPackedTensorAccessor<T,1,PtrTraits,index_t> : public GenericPackedTensorAccessorBase<T,1,PtrTraits,index_t> {
+public:
+  using PtrType = typename PtrTraits<T>::PtrType;
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_, const index_t* sizes_, const index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
+  // if index_t is not int64_t, we want to have an int64_t constructor
+  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
+  C10_HOST GenericPackedTensorAccessor(
+      PtrType data_,
+      const source_index_t* sizes_,
+      const source_index_t* strides_)
+      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
+  // Element `i`, found `strides_[0] * i` entries past the data pointer.
+  C10_DEVICE T & operator[](index_t i) {
+    const auto offset = this->strides_[0] * i;
+    return this->data_[offset];
+  }
+  C10_DEVICE const T& operator[](index_t i) const {
+    const auto offset = this->strides_[0] * i;
+    return this->data_[offset];
+  }
+  // Mirrors the N-dimensional transpose(). With a single dimension nothing is
+  // swapped, so after both indices pass bounds_check_ the result is simply a
+  // copy built from the same data pointer, sizes and strides.
+  C10_HOST GenericPackedTensorAccessor<T, 1, PtrTraits, index_t> transpose(
+      index_t dim1,
+      index_t dim2) const {
+    this->bounds_check_(dim1);
+    this->bounds_check_(dim2);
+    GenericPackedTensorAccessor same(
+        this->data_, this->sizes_, this->strides_);
+    return same;
+  }
+};
+// Can't put this directly into the macro function args because of commas
+// (AT_X is a temporary alias for the type and is undefined again right below).
+#define AT_X GenericPackedTensorAccessor<T, N, PtrTraits, index_t>
+// Old name for `GenericPackedTensorAccessor`
+template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+C10_DEFINE_DEPRECATED_USING(PackedTensorAccessor, AT_X)
+#undef AT_X
+// GenericPackedTensorAccessor with index_t fixed to int32_t.
+template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
+using PackedTensorAccessor32 = GenericPackedTensorAccessor<T, N, PtrTraits, int32_t>;
+// GenericPackedTensorAccessor with index_t fixed to int64_t.
+template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
+using PackedTensorAccessor64 = GenericPackedTensorAccessor<T, N, PtrTraits, int64_t>;
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/TensorBase.h ADDED Viewed

	@@ -0,0 +1,1056 @@

+#pragma once
+#include <c10/core/Device.h>
+#include <c10/core/Layout.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/ScalarType.h>
+#include <c10/core/ScalarTypeToTypeMeta.h>
+#include <c10/core/Storage.h>
+#include <c10/core/SymIntArrayRef.h>
+#include <c10/core/TensorImpl.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/core/UndefinedTensorImpl.h>
+#include <c10/core/WrapDimMinimal.h>
+#include <c10/util/C++17.h>
+#include <c10/util/Exception.h>
+#include <c10/util/ExclusivelyOwned.h>
+#include <c10/util/ExclusivelyOwnedTensorTraits.h>
+#include <c10/util/MaybeOwned.h>
+#include <optional>
+#include <c10/util/intrusive_ptr.h>
+#include <ATen/core/NamedTensor.h>
+#include <ATen/core/QuantizerBase.h>
+#include <ATen/core/TensorAccessor.h>
+#include <ATen/StorageUtils.h>
+namespace c10 {
+class Scalar;
+}
+namespace torch::autograd {
+struct Node;
+} // namespace torch::autograd
+namespace at {
+class Tensor;
+class TensorBase;
+// Convert Tensor to TensorBase without any need to include Tensor.h
+TORCH_API const TensorBase& get_tensor_base(const Tensor& t);
+namespace impl {
+// Reports whether the excluded set of the dispatch key set returned by
+// c10::impl::tls_local_dispatch_key_set() is a superset of
+// c10::autograd_dispatch_keyset. Builds that define C10_MOBILE skip the
+// lookup and always answer true.
+inline bool variable_excluded_from_dispatch() {
+#ifdef C10_MOBILE
+  // Please read the comment in `VariableFallbackKernel.cpp` about the background of this change.
+  return true;
+#else
+  return c10::impl::tls_local_dispatch_key_set().excluded_.isSupersetOf(c10::autograd_dispatch_keyset);
+#endif
+}
+} // namespace impl
+// NOTE: [Tensor vs. TensorBase]
+//
+// Tensor, being the central data structure in PyTorch, gets used and
+// its header included almost everywhere. Unfortunately this means
+// every time an operator signature is updated or changed in
+// native_functions.yaml, you (and every other PyTorch developer) need
+// to recompile all of ATen and its dependencies.
+//
+// TensorBase aims to break up these header dependencies, and improve
+// incremental build times for all PyTorch developers. TensorBase
+// represents a reference counted handle to TensorImpl, exactly the
+// same as Tensor. However, TensorBase doesn't have code generated
+// methods in its API and thus no dependence on native_functions.yaml.
+//
+// Usage tips
+// ----------
+// - You can `#define TORCH_ASSERT_NO_OPERATORS` at the top of a .cpp
+//   or .cu file to ensure it has no header dependencies on
+//   native_functions.yaml (direct or indirect).
+// - Tensor inherits from TensorBase, so functions taking
+//   `const TensorBase &` are callable with Tensor as well.
+// - TensorBase can be converted to Tensor with `Tensor(tensor_base)`,
+//   but this requires a reference-count bump. OptionalTensorRef, on
+//   the other hand, can materialize a `const Tensor &` without
+//   touching the reference-count.
+class TORCH_API TensorBase {
+ public:
+  struct unsafe_borrow_t { explicit unsafe_borrow_t() = default; };
+ protected:
+  // Create a Tensor with a +0 reference count. Special care must be
+  // taken to avoid decrementing this reference count at destruction
+  // time. Intended to support MaybeOwnedTraits<Tensor>.
+  explicit TensorBase(unsafe_borrow_t, const TensorBase& rhs)
+      : impl_(c10::intrusive_ptr<at::TensorImpl, UndefinedTensorImpl>(rhs.impl_.get(), c10::raw::DontIncreaseRefcount{})) {}
+  friend MaybeOwnedTraits<TensorBase>;
+ public:
+  TensorBase() = default;
+  // Takes ownership of an existing TensorImpl handle. Not meant for end
+  // users: this is an implementation detail called from autogenerated code.
+  explicit TensorBase(
+      c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
+      : impl_(std::move(tensor_impl)) {
+    // A handle whose raw pointer is null is rejected.
+    if (!impl_.get()) {
+      throw std::runtime_error("TensorImpl with nullptr is not supported");
+    }
+  }
+  TensorBase(const TensorBase&) = default;
+  TensorBase(TensorBase&&) noexcept = default;
+  ~TensorBase() noexcept = default;
+ public:
+  // Builds a TensorBase around a TensorImpl handle and runs
+  // enforce_invariants() on it before returning. Kept as a separate static
+  // factory because it should be used with care.
+  static TensorBase wrap_tensor_impl(
+      c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl) {
+    TensorBase wrapped(std::move(tensor_impl));
+    wrapped.enforce_invariants();
+    return wrapped;
+  }
+  int64_t dim() const {
+    return impl_->dim();
+  }
+  int64_t storage_offset() const {
+    return impl_->storage_offset();
+  }
+  // Returns a tensor that is contiguous in `memory_format`: a copy of this
+  // handle when is_contiguous(memory_format) already holds, otherwise the
+  // result of __dispatch_contiguous(memory_format).
+  TensorBase contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const {
+    if (!is_contiguous(memory_format)) {
+      return __dispatch_contiguous(memory_format);
+    }
+    return *this;
+  }
+  /// Should be used if *this can reasonably be expected to be contiguous and
+  /// performance is important.
+  /// Compared to contiguous, it saves a reference count
+  /// increment/decrement if *this is already contiguous, at the cost
+  /// in all cases of an extra pointer of stack usage, an extra branch
+  /// to access, and an extra branch at destruction time.
+  c10::MaybeOwned<TensorBase> expect_contiguous(
+      MemoryFormat memory_format=MemoryFormat::Contiguous) const &;
+  // Use .contiguous() instead. Trying to borrow from a prvalue
+  // will only lead to trouble and dangling references.
+  c10::MaybeOwned<TensorBase> expect_contiguous(
+      MemoryFormat memory_format=MemoryFormat::Contiguous) && = delete;
+  const TensorBase& fill_(const c10::Scalar& scalar) const;
+  const TensorBase& zero_() const;
+  TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, std::optional<at::MemoryFormat> memory_format=std::nullopt) const;
+  bool is_complex() const {
+    return at::isComplexType(this->scalar_type());
+  }
+  bool is_floating_point() const {
+    return at::isFloatingType(this->scalar_type());
+  }
+  bool is_signed() const {
+    return at::isSignedType(this->scalar_type());
+  }
+  c10::SymInt sym_size(int64_t dim) const {
+    return impl_->sym_size(dim);
+  }
+  // Symbolic stride of dimension `dim`, looked up in sym_strides() after the
+  // index has been passed through c10::maybe_wrap_dim.
+  c10::SymInt sym_stride(int64_t dim) const {
+    const auto strides = this->sym_strides();
+    const auto rank = static_cast<int64_t>(strides.size());
+    // wrap_scalar is false so behavior is identical to array access (but with wrapping)
+    const auto wrapped_dim =
+        c10::maybe_wrap_dim(dim, rank, /*wrap_scalar=*/false);
+    return strides[wrapped_dim];
+  }
+  int64_t size(int64_t dim) const {
+    return impl_->size(dim);
+  }
+  // Stride of dimension `dim`, looked up in strides() after the index has
+  // been passed through c10::maybe_wrap_dim.
+  int64_t stride(int64_t dim) const {
+    const auto all_strides = this->strides();
+    const auto rank = static_cast<int64_t>(all_strides.size());
+    // wrap_scalar is false so behavior is identical to array access (but with wrapping)
+    const auto wrapped_dim =
+        c10::maybe_wrap_dim(dim, rank, /*wrap_scalar=*/false);
+    return all_strides[wrapped_dim];
+  }
+  TensorImpl * unsafeGetTensorImpl() const {
+    return impl_.get();
+  }
+  // Calls release() on the underlying intrusive_ptr and returns the raw
+  // TensorImpl pointer it yields.
+  // NOTE(review): presumably the caller becomes responsible for the
+  // reference that impl_ was holding — confirm against intrusive_ptr::release.
+  TensorImpl * unsafeReleaseTensorImpl() {
+    return impl_.release();
+  }
+  const c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>& getIntrusivePtr() const {
+    return impl_;
+  }
+  // Moves the underlying intrusive_ptr out of this TensorBase and returns it
+  // by value; impl_ is left in its moved-from state.
+  c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> unsafeReleaseIntrusivePtr() {
+    return std::move(impl_);
+  }
+  bool defined() const {
+    return impl_;
+  }
+  void reset() {
+    impl_.reset();
+  }
+  // Copy/move assignment is only available on lvalues (note the `&`
+  // ref-qualifier). Under MSVC the operators are written out by hand and just
+  // assign impl_; elsewhere they are defaulted.
+  // NOTE(review): presumably the hand-written versions work around an MSVC
+  // limitation with defaulted ref-qualified assignment — confirm.
+#if defined (_MSC_VER)
+  TensorBase& operator=(const TensorBase& x) & {
+    impl_ = x.impl_;
+    return *this;
+  };
+  TensorBase& operator=(TensorBase&& x) & noexcept {
+    impl_ = std::move(x.impl_);
+    return *this;
+  }
+#else
+  TensorBase& operator=(const TensorBase& x) & = default;
+  TensorBase& operator=(TensorBase&& x) & noexcept = default;
+#endif
+  // Ban assignment to rvalues, since at::Tensor (weirdly) performs a deep copy here
+  TensorBase& operator=(const TensorBase&) && = delete;
+  TensorBase& operator=(TensorBase&&) && noexcept = delete;
+  bool is_same(const TensorBase& other) const noexcept {
+    return impl_ == other.impl_;
+  }
+  size_t use_count() const noexcept {
+    return impl_.use_count();
+  }
+  size_t weak_use_count() const noexcept {
+    return impl_.weak_use_count();
+  }
+  std::string toString() const;
+  IntArrayRef sizes() const {
+    return impl_->sizes();
+  }
+  c10::SymIntArrayRef sym_sizes() const {
+    return impl_->sym_sizes();
+  }
+  c10::SymIntArrayRef sym_strides() const {
+    return impl_->sym_strides();
+  }
+  IntArrayRef strides() const {
+    return impl_->strides();
+  }
+  // See impl::get_opt_names in ATen/NamedTensor.h for docs.
+  std::optional<DimnameList> opt_names() const {
+    return impl::get_opt_names(unsafeGetTensorImpl());
+  }
+  // See impl::get_names in ATen/NamedTensor.h for docs.
+  DimnameList names() const {
+    return impl::get_names(unsafeGetTensorImpl());
+  }
+  int64_t ndimension() const {
+    return dim();
+  }
+  bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const {
+    return impl_->is_contiguous(memory_format);
+  }
+  bool is_non_overlapping_and_dense() const {
+    return impl_->is_non_overlapping_and_dense();
+  }
+  // Suggests a memory format based on this tensor's layout and strides.
+  // Non-strided layouts always get Contiguous. Setting
+  // channels_last_strides_exact_match to true forces the function to check
+  // 0,1 - sized dimension strides: the strides must then equal the reference
+  // channels-last strides computed from sizes().
+  at::MemoryFormat suggest_memory_format(
+      bool channels_last_strides_exact_match = false) const {
+    if (layout() != at::kStrided) {
+      return at::MemoryFormat::Contiguous;
+    }
+    if (impl_->is_strides_like_channels_last()) {
+      const bool accepted = !channels_last_strides_exact_match ||
+          get_channels_last_strides_2d(sizes()) == strides();
+      // When the exact match fails the answer is Contiguous; the 3d check
+      // below is not consulted in that case.
+      return accepted ? at::MemoryFormat::ChannelsLast
+                      : at::MemoryFormat::Contiguous;
+    }
+    if (impl_->is_strides_like_channels_last_3d()) {
+      const bool accepted = !channels_last_strides_exact_match ||
+          get_channels_last_strides_3d(sizes()) == strides();
+      return accepted ? at::MemoryFormat::ChannelsLast3d
+                      : at::MemoryFormat::Contiguous;
+    }
+    return at::MemoryFormat::Contiguous;
+  }
+  // Total bytes consumed by the "view" of elements of the array.  Does not
+  // include size of metadata.  The number reported here does not necessarily
+  // correspond to the true physical memory consumed by a tensor; instead,
+  // it reports the memory the tensor would take *if* it were contiguous.
+  // Defined to be numel() * itemsize()
+  //
+  // Fails the TORCH_CHECK below when layout() is at::kSparse; no other layout
+  // is rejected here.
+  size_t nbytes() const {
+    TORCH_CHECK(layout () != at::kSparse,
+                "nbytes is not defined for sparse tensors.  If you want the size of the constituent " \
+                "tensors, add the nbytes of the indices and values.  If you want the size of the  " \
+                "equivalent dense tensor, multiply numel() by element_size()");
+    return impl_->numel() * impl_->itemsize();
+  }
+  // Same computation as nbytes(), but built from sym_numel() and returned as
+  // a c10::SymInt: sym_numel() * itemsize(). Fails the TORCH_CHECK below when
+  // layout() is at::kSparse.
+  c10::SymInt sym_nbytes() const {
+    TORCH_CHECK(layout () != at::kSparse,
+                "nbytes is not defined for sparse tensors.  If you want the size of the constituent " \
+                "tensors, add the nbytes of the indices and values.  If you want the size of the  " \
+                "equivalent dense tensor, multiply numel() by element_size()");
+    return impl_->sym_numel() * impl_->itemsize();
+  }
+  int64_t numel() const {
+    return impl_->numel();
+  }
+  c10::SymInt sym_numel() const {
+    return impl_->sym_numel();
+  }
+  c10::SymInt sym_storage_offset() const {
+    return impl_->sym_storage_offset();
+  }
+  // Length of one array element in bytes.  This is the traditional
+  // Numpy naming.
+  size_t itemsize() const {
+    return impl_->itemsize();
+  }
+  // Same as itemsize().  This is the PyTorch naming.
+  int64_t element_size() const {
+    return static_cast<int64_t>(impl_->itemsize());
+  }
+  DispatchKeySet key_set() const {
+    return impl_->key_set();
+  }
+  ScalarType scalar_type() const {
+    return typeMetaToScalarType(impl_->dtype());
+  }
+  bool has_storage() const {
+    return defined() && impl_->has_storage();
+  }
+  const Storage& storage() const {
+    return impl_->storage();
+  }
+  bool is_alias_of(const at::TensorBase& other) const{
+    return impl_->storage().is_alias_of(other.storage());
+  }
+  // Move the storage backend to shm based
+  // to enable memory sharing across processes.
+  //
+  // NB1: the ideal behavior of this API still requires further discussion
+  // but for now we are inclined to keep it consistent with existing THP behavior
+  // https://github.com/pytorch/pytorch/blob/4dca9bde0552afc67b5b74f4a0696fe6055709c4/torch/storage.py#L196-L212
+  // so we don't assert on anything here and rely on caller knowing
+  // what it's doing.
+  //
+  // NB2: this currently provides Linux fd based shm support only
+  // to simplify the storage lifetime management logic in ATen
+  // and similarly for now we are not adding support for file system based
+  // shm support like in THP due to additional GC manager support needed
+  // to prevent leaks.
+  // As such, calling this from non supported systems (e.g. Windows) would fail.
+  void share_memory_() {
+    at::share_memory_(*this);
+  }
+  inline bool _is_zerotensor() const {
+    return impl_->_is_zerotensor();
+  }
+  inline void _set_zero(bool zero) const {
+    impl_->_set_zero(zero);
+  }
+  inline bool is_conj() const {
+    return impl_->is_conj();
+  }
+  // sets the conjugate bit of a tensor.
+  // NOTE: Conjugate bit is supposed to be a read-only field. Only change this, if you are sure
+  // that's what you want. Changing this might lead to incorrect behavior since conjugation is
+  // a lazy operation and we rely on this bit to determine if a conjugation needs to be materialized.
+  inline void _set_conj(bool conjugate) const {
+    impl_->_set_conj(conjugate);
+  }
+  inline bool is_neg() const {
+    return impl_->is_neg();
+  }
+  // sets the negative bit of a tensor.
+  // NOTE: Negative bit is supposed to be a read-only field. Only change this, if you are sure
+  // that's what you want. Changing this might lead to incorrect behavior since we rely on this
+  // bit to determine if a negation needs to be materialized.
+  inline void _set_neg(bool negative) const {
+    impl_->_set_neg(negative);
+  }
+  /// Returns a `Tensor`'s layout.
+  Layout layout() const {
+    return impl_->layout();
+  }
+  /// Returns a `Tensor`'s dtype (`TypeMeta`).
+  caffe2::TypeMeta dtype() const {
+    return impl_->dtype();
+  }
+  /// Returns a `Tensor`'s device.
+  inline Device device() const {
+    return impl_->device();
+  }
+  /// Returns a `Tensor`'s device index.
+  DeviceIndex get_device() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->get_device();
+  }
+  /// Returns if a `Tensor` has CPU backend.
+  bool is_cpu() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_cpu();
+  }
+  /// Returns if a `Tensor` has CUDA backend.
+  bool is_cuda() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_cuda();
+  }
+  /// Returns if a `Tensor` has IPU backend.
+  bool is_ipu() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_ipu();
+  }
+  /// Returns if a `Tensor` has XPU backend.
+  bool is_xpu() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_xpu();
+  }
+  /// Returns if a `Tensor` has XLA backend.
+  bool is_xla() const {
+    return impl_->is_xla();
+  }
+  /// Returns if a `Tensor` has MTIA backend.
+  bool is_mtia() const {
+    return impl_->is_mtia();
+  }
+  /// Returns if a `Tensor` has HPU backend.
+  bool is_hpu() const {
+    return impl_->is_hpu();
+  }
+  /// Returns if a `Tensor` has Lazy backend.
+  bool is_lazy() const {
+    return impl_->is_lazy();
+  }
+  /// Returns if a `Tensor` has HIP backend.
+  bool is_hip() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_hip();
+  }
+  /// Returns if a `Tensor` has VE backend.
+  bool is_ve() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_ve();
+  }
+  /// Returns if a `Tensor` has PrivateUse1 backend.
+  bool is_privateuseone() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_privateuseone();
+  }
+  /// Returns if a `Tensor` has sparse backend.
+  bool is_sparse() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_sparse();
+  }
+  /// Returns if a `Tensor` has a sparse CSR backend.
+  bool is_sparse_csr() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_sparse_csr();
+  }
+  /// Returns if a `Tensor` is mkldnn tensor.
+  bool is_mkldnn() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_mkldnn();
+  }
+  /// Returns if a `Tensor` is mps tensor.
+  bool is_mps() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_mps();
+  }
+  /// Returns if a `Tensor` is maia tensor.
+  bool is_maia() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_maia();
+  }
+  /// Returns if a `Tensor` is vulkan tensor.
+  bool is_vulkan() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_vulkan();
+  }
+  /// Returns if a `Tensor` is metal tensor.
+  bool is_metal() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_metal();
+  }
+  /// Returns if a `Tensor` has quantized backend.
+  bool is_quantized() const {
+    // NB: this is not a native function to avoid dispatching overhead.
+    return impl_->is_quantized();
+  }
+  /// Returns if a `Tensor` is a meta tensor.  Meta tensors can
+  /// also have other designations.
+  bool is_meta() const {
+    return impl_->is_meta();
+  }
+  /// Returns if a `Tensor` is an inference tensor.
+  bool is_inference() const {
+    return impl_->is_inference();
+  }
+  /// Returns if a `Tensor` is a NestedTensor.
+  bool is_nested() const {
+    return impl_->is_nested();
+  }
+  /// If a tensor is a quantized tensor, returns its quantizer
+  /// TODO: it's not in native_functions.yaml yet as it's not exposed to python
+  QuantizerPtr quantizer() const;
+  /// Returns if a `Tensor` has any dimension names
+  bool has_names() const {
+    // Tensors without named-tensor metadata short-circuit to false, so
+    // impl::has_names (which attempts to retrieve names) is only reached
+    // when that metadata exists.
+    return impl_->has_named_tensor_meta() &&
+        impl::has_names(unsafeGetTensorImpl());
+  }
+  /// Returns a `Tensor`'s dimension names data structure
+  const NamedTensorMeta* get_named_tensor_meta() const {
+    return static_cast<NamedTensorMeta*>(impl_->named_tensor_meta());
+  }
+  NamedTensorMeta* get_named_tensor_meta() {
+    return static_cast<NamedTensorMeta*>(impl_->named_tensor_meta());
+  }
+  /// Returns the `TensorOptions` corresponding to this `Tensor`: a default
+  /// `TensorOptions` with this tensor's dtype, device and layout applied, in
+  /// that order. Defined in TensorOptions.h.
+  TensorOptions options() const {
+    const auto with_dtype = TensorOptions().dtype(dtype());
+    const auto with_device = with_dtype.device(device());
+    return with_device.layout(layout());
+  }
+  // Untyped read-only pointer: forwards to data() on the underlying
+  // TensorImpl.
+  const void* const_data_ptr() const {
+    return this->unsafeGetTensorImpl()->data();
+  }
+  // Untyped writable pointer: forwards to mutable_data() on the underlying
+  // TensorImpl. Callable on a const TensorBase.
+  void* mutable_data_ptr() const {
+    return this->unsafeGetTensorImpl()->mutable_data();
+  }
+  // TODO(#97856) Make this return a const pointer. This currently
+  //              returns a non-const pointer because of the large
+  //              number of clients that we still want to audit before
+  //              migrating to mutable_data_ptr().
+  void* data_ptr() const {
+    return mutable_data_ptr();
+  }
+  template <typename T, std::enable_if_t<!std::is_const_v<T>, int> = 0>
+  const T* const_data_ptr() const;
+  template <typename T, std::enable_if_t<std::is_const_v<T>, int> = 0>
+  const std::remove_const_t<T>* const_data_ptr() const;
+  template <typename T>
+  T* mutable_data_ptr() const;
+  // Legacy interface during the migration to indicate that a callsite
+  // has not been audited for mutability.
+  //
+  // Do not add new uses of this, use const_data_ptr() if possible,
+  // mutable_data_ptr() otherwise.
+  //
+  // TODO(#97856) Make this return a const pointer. This is currently
+  //              const because of the vast number of clients that
+  //              rely on this.
+  template <typename T>
+  T* data_ptr() const;
+  // Purposely not defined here to avoid inlining
+  void print() const;
+  // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and
+  // dimension.
+  //
+  // N == 0 is rejected at compile time; a mismatch between N and dim() fails
+  // the TORCH_CHECK at run time. The accessor is handed raw pointers to this
+  // tensor's size and stride arrays (sizes().data(), strides().data()), not
+  // copies of them.
+  template<typename T, size_t N>
+  TensorAccessor<T,N> accessor() const& {
+    static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr<T>()");
+    TORCH_CHECK(dim() == N, "TensorAccessor expected ", N, " dims but tensor has ", dim());
+    T* ptr = nullptr;
+    // A const-qualified T reads through const_data_ptr; any other T goes
+    // through mutable_data_ptr.
+    if constexpr (std::is_const_v<T>) {
+      ptr = const_data_ptr<T>();
+    } else {
+      ptr = mutable_data_ptr<T>();
+    }
+    return TensorAccessor<T,N>(ptr,sizes().data(),strides().data());
+  }
+  template<typename T, size_t N>
+  TensorAccessor<T,N> accessor() && = delete;
+  // Return a `GenericPackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and
+  // dimension. You can optionally specify RestrictPtrTraits as a template parameter to
+  // cast the data pointer to a __restrict__ pointer.
+  // In order to use this, your CUDA kernel has to take a corresponding GenericPackedTensorAccessor
+  // as an argument.
+  //
+  // N == 0 is rejected at compile time; a mismatch between N and dim() fails
+  // the TORCH_CHECK at run time. `index_t` selects the integer type the
+  // accessor is instantiated with.
+  template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+  GenericPackedTensorAccessor<T,N,PtrTraits,index_t> generic_packed_accessor() const& {
+    static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr<T>()");
+    TORCH_CHECK(dim() == N, "TensorAccessor expected ", N, " dims but tensor has ", dim());
+    T* ptr = nullptr;
+    // A const-qualified T reads through const_data_ptr; any other T goes
+    // through mutable_data_ptr.
+    if constexpr (std::is_const_v<T>) {
+      ptr = const_data_ptr<T>();
+    } else {
+      ptr = mutable_data_ptr<T>();
+    }
+    // The pointer is cast to the PtrTraits pointer type before being packed
+    // together with the int64_t size/stride arrays of this tensor.
+    return GenericPackedTensorAccessor<T,N,PtrTraits,index_t>(static_cast<typename PtrTraits<T>::PtrType>(ptr),sizes().data(),strides().data());
+  }
+  // Deleted for rvalues, so a packed accessor cannot be requested from a
+  // temporary tensor. The signature mirrors the lvalue overload exactly,
+  // including PtrTraits and index_t in the return type.
+  template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+  GenericPackedTensorAccessor<T,N,PtrTraits,index_t> generic_packed_accessor() && = delete;
+  // Packed accessor using int32_t as its index type. The TORCH_CHECK below
+  // fails unless numel() is at most std::numeric_limits<int32_t>::max(); the
+  // rest is delegated to generic_packed_accessor.
+  template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
+  PackedTensorAccessor32<T,N,PtrTraits> packed_accessor32() const& {
+    TORCH_CHECK(
+        impl_->numel() <=
+            static_cast<int64_t>(std::numeric_limits<int32_t>::max()),
+        "numel needs to be smaller than int32_t max; otherwise, please use packed_accessor64");
+    return generic_packed_accessor<T,N,PtrTraits,int32_t>();
+  }
+  template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
+  PackedTensorAccessor32<T,N,PtrTraits> packed_accessor32() && = delete;
+  template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
+  PackedTensorAccessor64<T,N,PtrTraits> packed_accessor64() const& {
+    return generic_packed_accessor<T,N,PtrTraits,int64_t>();
+  }
+  template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
+  PackedTensorAccessor64<T,N,PtrTraits> packed_accessor64() && = delete;
+  // ~~~~~ Autograd API ~~~~~
+  /// \fn bool is_leaf() const;
+  ///
+  /// All Tensors that have `requires_grad()` which is ``false`` will be leaf Tensors by convention.
+  ///
+  /// For Tensors that have `requires_grad()` which is ``true``, they will be leaf Tensors if they were
+  /// created by the user. This means that they are not the result of an operation and so
+  /// `grad_fn()` is `nullptr`.
+  ///
+  /// Only leaf Tensors will have their `grad()` populated during a call to `backward()`.
+  /// To get `grad()` populated for non-leaf Tensors, you can use `retain_grad()`.
+  ///
+  /// Example:
+  /// @code
+  /// auto a = torch::rand(10, torch::requires_grad());
+  /// std::cout << a.is_leaf() << std::endl; // prints `true`
+  ///
+  /// auto b = torch::rand(10, torch::requires_grad()).to(torch::kCUDA);
+  /// std::cout << b.is_leaf() << std::endl; // prints `false`
+  /// // b was created by the operation that cast a cpu Tensor into a cuda Tensor
+  ///
+  /// auto c = torch::rand(10, torch::requires_grad()) + 2;
+  /// std::cout << c.is_leaf() << std::endl; // prints `false`
+  /// // c was created by the addition operation
+  ///
+  /// auto d = torch::rand(10).cuda();
+  /// std::cout << d.is_leaf() << std::endl; // prints `true`
+  /// // d does not require gradients and so has no operation creating it (that is tracked by the autograd engine)
+  ///
+  /// auto e = torch::rand(10).cuda().requires_grad_();
+  /// std::cout << e.is_leaf() << std::endl; // prints `true`
+  /// // e requires gradients and has no operations creating it
+  ///
+  /// auto f = torch::rand(10, torch::device(torch::kCUDA).requires_grad(true));
+  /// std::cout << f.is_leaf() << std::endl; // prints `true`
+  /// // f requires grad, has no operation creating it
+  /// @endcode
+  /// \fn void backward(const Tensor & gradient={}, std::optional<bool> retain_graph=std::nullopt, bool create_graph=false, std::optional<TensorList> inputs=std::nullopt) const;
+  ///
+  /// Computes the gradient of current tensor with respect to graph leaves.
+  ///
+  /// The graph is differentiated using the chain rule. If the tensor is
+  /// non-scalar (i.e. its data has more than one element) and requires
+  /// gradient, the function additionally requires specifying ``gradient``.
+  /// It should be a tensor of matching type and location, that contains
+  /// the gradient of the differentiated function w.r.t. this Tensor.
+  ///
+  /// This function accumulates gradients in the leaves - you might need to
+  /// zero them before calling it.
+  ///
+  /// \param gradient Gradient w.r.t. the
+  ///     tensor. If it is a tensor, it will be automatically converted
+  ///     to a Tensor that does not require grad unless ``create_graph`` is True.
+  ///     None values can be specified for scalar Tensors or ones that
+  ///     don't require grad. If a None value would be acceptable then
+  ///     this argument is optional.
+  /// \param retain_graph If ``false``, the graph used to compute
+  ///     the grads will be freed. Note that in nearly all cases setting
+  ///     this option to True is not needed and often can be worked around
+  ///     in a much more efficient way. Defaults to the value of
+  ///     ``create_graph``.
+  /// \param create_graph If ``true``, graph of the derivative will
+  ///     be constructed, allowing to compute higher order derivative
+  ///     products. Defaults to ``false``.
+  /// \param inputs Inputs w.r.t. which the gradient will be accumulated into
+  ///     ``at::Tensor::grad``. All other Tensors will be ignored. If not
+  ///     provided, the gradient is accumulated into all the leaf Tensors
+  ///     that were used to compute the current tensor.
+  ///     When inputs are provided and a given input is not a leaf,
+  ///     the current implementation will call its grad_fn (even though it is not strictly needed to get these gradients).
+  ///     It is an implementation detail on which the user should not rely.
+  ///     See https://github.com/pytorch/pytorch/pull/60521#issuecomment-867061780 for more details.
+  /// \fn Tensor detach() const;
+  ///
+  /// Returns a new Tensor, detached from the current graph.
+  /// The result will never require gradient.
+  /// \fn Tensor & detach_() const;
+  ///
+  /// Detaches the Tensor from the graph that created it, making it a leaf.
+  /// Views cannot be detached in-place.
+  /// \fn void retain_grad() const;
+  ///
+  /// Enables this Tensor to have its :attr:`grad` populated during
+  /// :func:`backward`. This is a no-op for leaf tensors.
+  /// \fn bool retains_grad() const;
+  ///
+  /// Is ``true`` if this Tensor is non-leaf and its :attr:`grad` is enabled to be
+  /// populated during :func:`backward`, ``false`` otherwise.
+  // Forwards `requires_grad` to the underlying `impl_` and returns `*this`
+  // so that calls can be chained. (Plain `//` is used on purpose: a `///`
+  // line here would be absorbed into the preceding `\fn` Doxygen block.)
+  const TensorBase& set_requires_grad(bool requires_grad) const {
+    impl_->set_requires_grad(requires_grad);
+    return *this;
+  }
+  // Reports the `requires_grad` flag as returned by the underlying `impl_`.
+  bool requires_grad() const {
+    return impl_->requires_grad();
+  }
+  // The Forward AD API functions below are low level and are not to be used by end
+  // users who should use the API provided in torch/csrc/autograd.h
+  /// This function returns the forward gradient for this Tensor at the given level.
+  /// The lookup is delegated to `impl_`, which also receives `*this`.
+  /// \param level Forward AD level whose gradient is requested.
+  const Tensor& _fw_grad(uint64_t level) const {
+    return impl_->_fw_grad(level, *this);
+  }
+  /// This function can be used to set the value of the forward grad.
+  /// Note that the given new_grad might not be used directly if it has different
+  /// metadata (size/stride/storage offset) compared to this Tensor. In that case,
+  /// new_grad content will be copied into a new Tensor.
+  /// \param new_grad The forward gradient to install.
+  /// \param level Forward AD level the gradient belongs to.
+  /// \param is_inplace_op Forwarded unchanged to `impl_->_set_fw_grad`.
+  void _set_fw_grad(const TensorBase& new_grad, uint64_t level, bool is_inplace_op) const {
+    impl_->_set_fw_grad(new_grad, *this, level, is_inplace_op);
+  }
+  /// NOTE: This is similar to the legacy `.data()` function on `Variable`, and is intended
+  /// to be used from functions that need to access the `Variable`'s equivalent `Tensor`
+  /// (i.e. `Tensor` that shares the same storage and tensor metadata with the `Variable`).
+  ///
+  /// One notable difference with the legacy `.data()` function is that changes to the
+  /// returned `Tensor`'s tensor metadata (e.g. sizes / strides / storage / storage_offset)
+  /// will not update the original `Variable`, due to the fact that this function
+  /// shallow-copies the `Variable`'s underlying TensorImpl.
+  at::TensorBase tensor_data() const;
+  /// NOTE: `var.variable_data()` in C++ has the same semantics as `tensor.data`
+  /// in Python, which create a new `Variable` that shares the same storage and
+  /// tensor metadata with the original `Variable`, but with a completely new
+  /// autograd history.
+  ///
+  /// NOTE: If we change the tensor metadata (e.g. sizes / strides /
+  /// storage / storage_offset) of a variable created from `var.variable_data()`, those
+  /// changes will not update the original variable `var`. In `.variable_data()`, we set
+  /// `allow_tensor_metadata_change_` to false to make such changes explicitly illegal,
+  /// in order to prevent users from changing metadata of `var.variable_data()`
+  /// and expecting the original variable `var` to also be updated.
+  at::TensorBase variable_data() const;
+  // Gradient Node and Edges
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  /// Gets the gradient function of the `Variable`. If this is a leaf variable,
+  /// the pointer returned will be null.
+  ///
+  /// For View Variables:
+  /// Gets the up-to-date grad_fn. If the shared data or base was modified, we
+  /// re-create the grad_fn to express the up-to-date view relationship between
+  /// this and the base Variable.
+  const std::shared_ptr<torch::autograd::Node>& grad_fn() const;
+  // Hooks
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // SFINAE helper: resolves to `unsigned` only when invoking a `T&` with a
+  // `TensorBase` argument yields `void`; otherwise substitution fails.
+  template <typename T>
+  using hook_return_void_t = std::enable_if_t<std::is_void_v<typename std::invoke_result_t<T&, TensorBase>>, unsigned>;
+  // SFINAE helper: resolves to `unsigned` only when invoking a `T&` with a
+  // `TensorBase` argument yields exactly `TensorBase`.
+  template <typename T>
+  using hook_return_var_t = std::enable_if_t<std::is_same_v<typename std::invoke_result_t<T&, TensorBase>, TensorBase>, unsigned>;
+  /// Registers a backward hook.
+  ///
+  /// The hook will be called every time a gradient with respect to the Tensor is computed.
+  /// The hook should have one of the following signatures:
+  /// ```
+  /// hook(TensorBase grad) -> TensorBase
+  /// ```
+  /// ```
+  /// hook(TensorBase grad) -> void
+  /// ```
+  /// The hook should not modify its argument, but it can optionally return a new gradient
+  /// which will be used in place of `grad`.
+  ///
+  /// This function returns the index of the hook in the list, which can be used to remove the hook.
+  ///
+  /// Example:
+  /// @code
+  /// auto v = torch::tensor({0., 0., 0.}, torch::requires_grad());
+  /// auto h = v.register_hook([](torch::Tensor grad){ return grad * 2; }); // double the gradient
+  /// v.backward(torch::tensor({1., 2., 3.}));
+  /// // This prints:
+  /// // ```
+  /// //  2
+  /// //  4
+  /// //  6
+  /// // [ CPUFloatType{3} ]
+  /// // ```
+  /// std::cout << v.grad() << std::endl;
+  /// v.remove_hook(h);  // removes the hook
+  /// @endcode
+  template <typename T>
+  hook_return_void_t<T> register_hook(T&& hook) const;
+  template <typename T>
+  hook_return_var_t<T> register_hook(T&& hook) const;
+protected:
+  unsigned _register_hook(std::function<TensorBase(const TensorBase&)> hook) const;
+public:
+  /// Remove hook at given position
+  void remove_hook(unsigned pos) const;
+  // Variable methods
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  bool is_leaf() const;
+  int64_t output_nr() const;
+  void set_data(const TensorBase & new_data) const;
+  TensorBase data() const;
+  int64_t _version() const;
+  void retain_grad() const;
+  bool retains_grad() const;
+  const TensorBase& requires_grad_(bool _requires_grad=true) const;
+  // View Variables
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  /// Returns true if this `Variable` is a view of another `Variable`.
+  bool is_view() const;
+  /// Returns the `Variable` that this `Variable` is a view of. If this
+  /// `Variable` is not a view, throw a `std::runtime_error`.
+  const TensorBase& _base() const;
+  // Miscellaneous
+  //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  const std::string& name() const;
+protected:
+  void enforce_invariants();
+  c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
+private:
+  TensorBase __dispatch_contiguous(c10::MemoryFormat) const;
+};
+// Free-function form of `TensorBase::get_device()`; simply forwards to the
+// member function on `self`.
+inline DeviceIndex get_device(const TensorBase& self) {
+  return self.get_device();
+}
+// Overload selected (via hook_return_void_t) for hooks that return void.
+template <typename T>
+auto TensorBase::register_hook(T&& hook) const -> TensorBase::hook_return_void_t<T> {
+  // `_register_hook` takes a std::function that returns a TensorBase, so a
+  // void-returning hook is wrapped in a lambda that invokes it and then
+  // returns a default-constructed TensorBase (NOT the grad argument).
+  static_assert(std::is_same_v<decltype(hook(TensorBase())), void>,
+                "Expected hook to return void");
+  return _register_hook([fn=std::forward<T>(hook)](const TensorBase& grad) {
+    fn(grad);
+    return TensorBase();
+  });
+}
+// Overload selected (via hook_return_var_t) for hooks that return a
+// TensorBase: the hook is forwarded to `_register_hook` unchanged, and the
+// value returned by `_register_hook` is passed back to the caller.
+template <typename T>
+auto TensorBase::register_hook(T&& hook) const -> TensorBase::hook_return_var_t<T> {
+  return _register_hook(std::forward<T>(hook));
+}
+namespace detail {
+// Helper creator for the TensorBase class which doesn't require users to pass
+// in an intrusive_ptr; instead it forwards the given arguments to
+// c10::make_intrusive<T> and wraps the resulting intrusive_ptr in a TensorBase.
+template <typename T, typename... Args>
+TensorBase make_tensor_base(Args&&... args) {
+  return TensorBase(c10::make_intrusive<T>(std::forward<Args>(args)...));
+}
+} // namespace detail
+// Convenience overload: applies the key-set overload of
+// legacyExtractDispatchKey to the tensor's `key_set()`.
+inline DispatchKey legacyExtractDispatchKey(const TensorBase& t) {
+  return legacyExtractDispatchKey(t.key_set());
+}
+} // namespace at
+namespace c10 {
+// MaybeOwned traits for at::TensorBase. The borrowed representation is itself
+// a TensorBase, built through the `unsafe_borrow_t` constructor and torn down
+// with `unsafeReleaseTensorImpl()` instead of the regular destructor path.
+template <>
+struct MaybeOwnedTraits<at::TensorBase> {
+  using owned_type = at::TensorBase;
+  using borrow_type = at::TensorBase;
+  // Creates a borrow of `from` via the unsafe_borrow_t constructor.
+  static borrow_type createBorrow(const owned_type& from) {
+    // NOTE: this can be implemented without the special
+    // unsafe_borrow_t Tensor constructor as
+    //
+    // return borrow_type(c10::intrusive_ptr<at::TensorImpl, at::UndefinedTensorImpl>::reclaim(from.unsafeGetTensorImpl()));
+    //
+    // but that hurts inlining due to the nullptr check in the
+    // Tensor(c10::intrusive_ptr<...>) constructor. We already know
+    // that from.impl_ isn't null because from is a valid Tensor, so
+    // we needn't do the check again. (using __builtin_assume can
+    // avoid this, but wouldn't be portable to MSVC.)
+    return borrow_type(borrow_type::unsafe_borrow_t{}, from);
+  }
+  // Rebinds the borrow `lhs` to `rhs`: the impl currently held by `lhs` is
+  // released first, then a fresh borrow of `rhs` is assigned.
+  static void assignBorrow(borrow_type& lhs, const borrow_type& rhs) {
+    lhs.unsafeReleaseTensorImpl();
+    // See above note: this can be implemented with public API
+    // similarly to createBorrow(), but that would hurt inlining.
+    lhs = borrow_type(borrow_type::unsafe_borrow_t{}, rhs);
+  }
+  // Ends a borrow by releasing the held impl pointer.
+  static void destroyBorrow(borrow_type& toDestroy) {
+    toDestroy.unsafeReleaseTensorImpl(); // "leak" it, but it was already +0.
+  }
+  // The borrow is a TensorBase itself, so it can be handed out directly.
+  static const owned_type& referenceFromBorrow(const borrow_type& borrow) {
+    return borrow;
+  }
+  static const owned_type* pointerFromBorrow(const borrow_type& borrow) {
+    return &borrow;
+  }
+  // No validity check is performed for TensorBase borrows; always true.
+  static bool debugBorrowIsValid(const borrow_type& /*borrow*/) {
+    return true;
+  }
+};
+// ExclusivelyOwned support for at::TensorBase: all behavior is inherited from
+// the shared c10::ExclusivelyOwnedTensorTraits implementation.
+template <>
+struct ExclusivelyOwnedTraits<at::TensorBase> : public c10::ExclusivelyOwnedTensorTraits<at::TensorBase> {};
+} // namespace c10
+namespace at {
+// Turns an optional tensor into a MaybeOwned<TensorBase>: a borrow of the
+// contained tensor when `opt` is engaged, otherwise an owned TensorBase that
+// is constructed in place with no arguments.
+inline c10::MaybeOwned<TensorBase> borrow_from_optional_tensor(
+    const std::optional<TensorBase>& opt) {
+  if (opt.has_value()) {
+    return c10::MaybeOwned<TensorBase>::borrowed(*opt);
+  }
+  return c10::MaybeOwned<TensorBase>::owned(std::in_place);
+}
+// Returns this tensor as a borrow when it is already contiguous in
+// `memory_format`; otherwise returns an owned tensor produced by
+// `__dispatch_contiguous(memory_format)`.
+inline c10::MaybeOwned<TensorBase> TensorBase::expect_contiguous(MemoryFormat memory_format) const & {
+  using MaybeOwnedBase = c10::MaybeOwned<TensorBase>;
+  if (!is_contiguous(memory_format)) {
+    return MaybeOwnedBase::owned(__dispatch_contiguous(memory_format));
+  }
+  return MaybeOwnedBase::borrowed(*this);
+}
+// Accessors that are generic over the integer representation. The caller
+// names `T` explicitly (e.g. `symint::sizes<c10::SymInt>(t)`):
+//   - T == c10::SymInt selects the `sym_*` accessor of the tensor,
+//   - T == int64_t     selects the plain accessor.
+// Any other `T` matches neither overload, because both are disabled through
+// their defaulted second template argument.
+namespace symint {
+// Well-formed only when T is c10::SymInt.
+template <typename T>
+using enable_if_symint = std::enable_if_t<std::is_same_v<T, c10::SymInt>>;
+// Well-formed only when T is int64_t.
+template <typename T>
+using enable_if_int = std::enable_if_t<std::is_same_v<T, int64_t>>;
+template <typename T, typename = enable_if_symint<T>>
+c10::SymIntArrayRef sizes(const TensorBase& t) { return t.sym_sizes(); }
+template <typename T, typename = enable_if_int<T>>
+IntArrayRef sizes(const TensorBase& t) { return t.sizes(); }
+template <typename T, typename = enable_if_symint<T>>
+c10::SymInt size(const TensorBase& t, int64_t dim) { return t.sym_size(dim); }
+template <typename T, typename = enable_if_int<T>>
+int64_t size(const TensorBase& t, int64_t dim) { return t.size(dim); }
+template <typename T, typename = enable_if_symint<T>>
+c10::SymIntArrayRef strides(const TensorBase& t) { return t.sym_strides(); }
+template <typename T, typename = enable_if_int<T>>
+IntArrayRef strides(const TensorBase& t) { return t.strides(); }
+template <typename T, typename = enable_if_symint<T>>
+c10::SymInt numel(const TensorBase& t) { return t.sym_numel(); }
+template <typename T, typename = enable_if_int<T>>
+int64_t numel(const TensorBase& t) { return t.numel(); }
+} // namespace symint
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/TensorBody.h ADDED Viewed

The diff for this file is too large to render. See raw diff

.venv/lib/python3.12/site-packages/torch/include/ATen/core/TorchDispatchUtils.h ADDED Viewed

	@@ -0,0 +1,17 @@

+#pragma once
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <c10/core/impl/TorchDispatchModeTLS.h>
+#include <c10/util/ArrayRef.h>
+#include <torch/library.h>
+#include <optional>
+namespace at::impl {
+// Dispatch queries for a single tensor, a tensor list, and a list of optional
+// tensors. Only declared here; the definitions live outside this header.
+TORCH_API bool tensor_has_dispatch(const at::Tensor& t);
+TORCH_API bool tensorlist_has_dispatch(at::ITensorListRef li);
+TORCH_API bool tensorlist_has_dispatch(
+    const c10::List<std::optional<at::Tensor>>& li);
+// Re-exports c10::impl::dispatch_mode_enabled under at::impl.
+using c10::impl::dispatch_mode_enabled;
+} // namespace at::impl

.venv/lib/python3.12/site-packages/torch/include/ATen/core/TransformationHelper.h ADDED Viewed

	@@ -0,0 +1,175 @@

+// Include guard: this header defines explicit specializations (e.g.
+// DistAccumType<float>) that would be redefinition errors if the file were
+// included twice in one translation unit.
+#pragma once
+#include <ATen/NumericUtils.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Half.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/MathConstants.h>
+#include <cmath>
+#include <cstdint>
+#include <cassert>
+#include <limits>
+#include <type_traits>
+namespace at {
+// DistAccumType maps a scalar type to the accumulate type used for
+// distributions.
+// Note: Ideally we'd be using ATen/AccumulateType.h, but it looks
+// like there is some inconsistency in how accumulate types
+// are mapped currently, e.g. on the CPU side, float is mapped
+// to double.
+// The primary template is intentionally empty: types without a
+// specialization below have no `type` member.
+template <typename T>
+struct DistAccumType {  };
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template <> struct DistAccumType<half> { using type = float; };
+#endif
+template <> struct DistAccumType<BFloat16> { using type = float; };
+template <> struct DistAccumType<Half> { using type = float; };
+template <> struct DistAccumType<float> { using type = float; };
+template <> struct DistAccumType<double> { using type = double; };
+// Shorthand for `typename DistAccumType<T>::type`.
+template <typename T>
+using dist_acctype = typename DistAccumType<T>::type;
+namespace transformation {
+/**
+ * A transformation function for `torch.Tensor.random_()`, when both `from` and `to` are specified.
+ * `range` is `to - from`
+ * `base` is `from`
+ *
+ * Computes `(val % range) + base`, converts the sum to `int64_t` and then to `T`.
+ * `range` must be non-zero, since it is used as the divisor of `%`.
+ */
+template <typename T, typename V>
+C10_HOST_DEVICE inline T uniform_int_from_to(V val, uint64_t range, int64_t base) {
+  return static_cast<T>(static_cast<int64_t>((val % range) + base));
+}
+/**
+ * A transformation function for `torch.Tensor.random_()`, when `from=min_value(int64_t)` and to=None
+ *
+ * No range reduction is applied: `val` is converted to `int64_t` and then to `T`.
+ */
+template <typename T, typename V>
+C10_HOST_DEVICE inline T uniform_int_full_range(V val) {
+  return static_cast<T>(static_cast<int64_t>(val));
+}
+/**
+ * A transformation function for `torch.Tensor.random_()`, when used without specifying `from` and `to`.
+ * In order to prevent compiler warnings reported in GitHub issue 46391, T can't be float or double
+ * in this overloaded version
+ */
+template <typename T, typename V>
+C10_HOST_DEVICE inline std::enable_if_t<!(std::is_floating_point_v<T>), T>uniform_int(V val) {
+  if constexpr (std::is_same_v<T, bool>) {
+    // bool: keep only the lowest bit of `val`.
+    return static_cast<bool>(val & 1);
+  } else if constexpr (std::is_same_v<T, int64_t>) {
+    // int64_t: reduce modulo (max + 1), evaluated in uint64_t.
+    return static_cast<T>(val % (static_cast<uint64_t>(std::numeric_limits<T>::max()) + 1));
+  } else if constexpr (std::is_same_v<T, at::Half> || std::is_same_v<T, at::BFloat16>) {
+    // Half / BFloat16: reduce modulo (2^digits + 1), where digits is
+    // std::numeric_limits<T>::digits.
+    return static_cast<T>(val % static_cast<uint64_t>((1ULL << std::numeric_limits<T>::digits) + 1));
+  } else if constexpr (std::is_integral_v<T>) {
+    // Remaining integral types: reduce modulo (max + 1).
+    return static_cast<T>(val % (static_cast<uint64_t>(std::numeric_limits<T>::max()) + 1));
+  } else {
+    // Any other T is unsupported by this overload.
+    assert(false);
+    return 0;
+  }
+}
+/**
+ * An overloaded transformation function for `torch.Tensor.random_()`, when used without specifying `from` and `to`,
+ * added to fix compiler warnings reported in GitHub issue 46391. T is either float or double in this version.
+ *
+ * Reduces `val` modulo (2^digits + 1), where digits is `std::numeric_limits<T>::digits`,
+ * and converts the remainder to `T`.
+ */
+template<typename T, typename V>
+C10_HOST_DEVICE inline std::enable_if_t<std::is_floating_point_v<T>, T>uniform_int(V val) {
+  return static_cast<T>(val % static_cast<uint64_t>((1ULL << std::numeric_limits<T>::digits) + 1));
+}
+/**
+ * Maps the bits of `val` to a real value between `from` and `to`.
+ * The low `std::numeric_limits<T>::digits` bits of `val` are kept and scaled by
+ * 2^-digits, and the scaled value `x` is then mapped to `x * (to - from) + from`.
+ * The result is returned as `dist_acctype<T>`.
+ */
+template <typename T, typename V>
+C10_HOST_DEVICE inline dist_acctype<T> uniform_real(V val, T from, T to) {
+  // MASK selects the low `digits` bits; DIVISOR is 1 / 2^digits.
+  constexpr auto MASK = static_cast<V>((static_cast<uint64_t>(1) << std::numeric_limits<T>::digits) - 1);
+  constexpr auto DIVISOR = static_cast<dist_acctype<T>>(1) / (static_cast<uint64_t>(1) << std::numeric_limits<T>::digits);
+  dist_acctype<T> x = (val & MASK) * DIVISOR;
+  return (x * (to - from) + from);
+}
+/**
+ * Transforms normally distributed `val` with mean 0.0 and standard deviation 1.0 to
+ * normally distributed with `mean` and standard deviation `std`.
+ *
+ * Computed as `val * std + mean`.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T normal(T val, T mean, T std) {
+  return val * std + mean;
+}
+/**
+ * Transforms uniformly distributed `val` between 0.0 and 1.0 to
+ * Cauchy distribution with location parameter `median` and scale parameter `sigma`.
+ *
+ * `val` is first clamped to [eps, 1 - eps], with eps = `std::numeric_limits<T>::epsilon()`;
+ * the result is `median + sigma * tan(pi * (val - 0.5))`.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T cauchy(T val, T median, T sigma) {
+  // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function
+  // __tanf overflows and returns `inf/-inf` when (val > 1 - eps) or (val < 0 + eps),
+  // thus we clip those values.
+  constexpr T eps = std::numeric_limits<T>::epsilon();
+  constexpr T one_minus_eps = 1 - eps;
+  constexpr T zero_plus_eps = 0 + eps;
+  val = (val > one_minus_eps ? one_minus_eps : val);
+  val = (val < zero_plus_eps ? zero_plus_eps : val);
+  return median + sigma * at::tan(c10::pi<T> * (val - static_cast<T>(0.5)));
+}
+// Specialization of cauchy for double. Unlike the generic template, `val` is
+// used as given: no clipping to [eps, 1 - eps] is applied here.
+template <>
+C10_HOST_DEVICE inline double cauchy(double val, double median, double sigma) {
+  // https://en.wikipedia.org/wiki/Cauchy_distribution#Cumulative_distribution_function
+  return median + sigma * at::tan(c10::pi<double> * (val - static_cast<double>(0.5)));
+}
+/**
+ * Transforms uniformly distributed `val` between 0.0 and 1.0 to
+ * exponentially distributed with `lambda` parameter of the distribution.
+ *
+ * When compiled by a CUDA/HIP compiler the result is `-1 / lambda * log(val)`, with the
+ * logarithm replaced by `-epsilon / 2` for `val >= 1 - epsilon / 2`; otherwise it is
+ * `-1 / lambda * log1p(-val)`.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T exponential(T val, T lambda) {
+  // https://en.wikipedia.org/wiki/Exponential_distribution#Generating_exponential_variates
+  // Different implementations for CUDA and CPU to preserve original logic
+  // TODO: must be investigated and unified!!!
+  // https://github.com/pytorch/pytorch/issues/38662
+#if defined(__CUDACC__) || defined(__HIPCC__)
+      // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
+      // curand_uniform has (0,1] bounds. log(1) is 0 and exponential excludes 0.
+      // we need log to be not 0, and not underflow when converted to half
+      // fast __logf approximation can underflow, so set log to -epsilon/2 for 1 or close to 1 args
+  auto log = val >= static_cast<T>(1.) - std::numeric_limits<T>::epsilon() / 2
+      ? -std::numeric_limits<T>::epsilon() / 2
+      : at::log(val);
+  return static_cast<T>(-1.0) / lambda * log;
+#else
+  return static_cast<T>(-1.0) / lambda * at::log1p(-val);
+#endif
+}
+/**
+ * Transforms uniformly distributed `val` between 0.0 and 1.0 to
+ * geometrically distributed with success probability `p`.
+ *
+ * Computed as `ceil(log(val) / log1p(-p))`, converted to `T`.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T geometric(T val, T p) {
+  // https://en.wikipedia.org/wiki/Geometric_distribution#Related_distributions
+  return static_cast<T>(::ceil(at::log(val) / at::log1p(-p)));
+}
+/**
+ * Transforms normally distributed `val` to log-normally distributed.
+ *
+ * Computed as `exp(val)`.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T log_normal(T val) {
+  // https://en.wikipedia.org/wiki/Log-normal_distribution#Mode,_median,_quantiles
+  return at::exp(val);
+}
+/**
+ * Transforms uniformly distributed `val` between 0.0 and 1.0 to
+ * bernoulli distributed with success probability `p`.
+ *
+ * Returns the result of the comparison `val < p`, converted to `T`.
+ */
+template <typename T>
+C10_HOST_DEVICE inline T bernoulli(T val, T p) {
+  return val < p;
+}
+}} // namespace at::transformation

.venv/lib/python3.12/site-packages/torch/include/ATen/core/UndefinedTensorImpl.h ADDED Viewed

	@@ -0,0 +1 @@


1	+ #include <c10/core/UndefinedTensorImpl.h>

.venv/lib/python3.12/site-packages/torch/include/ATen/core/UnsafeFromTH.h ADDED Viewed

	@@ -0,0 +1,21 @@

+#pragma once
+#include <ATen/core/Tensor.h>
+namespace at {
+// Builds a Tensor from a raw pointer that is treated as a TensorImpl*.
+// The pointer is first adopted with intrusive_ptr::reclaim; when `retain` is
+// true and the pointer is not the UndefinedTensorImpl singleton, its
+// reference count is additionally incremented.
+inline Tensor unsafeTensorFromTH(void * th_pointer, bool retain) {
+  auto tensor_impl = c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>::reclaim(static_cast<TensorImpl*>(th_pointer));
+  if (retain && tensor_impl.get() != UndefinedTensorImpl::singleton()) {
+    c10::raw::intrusive_ptr::incref(tensor_impl.get());
+  }
+  return Tensor(std::move(tensor_impl));
+}
+// Builds a Storage from a raw pointer that is treated as a StorageImpl*.
+// When `retain` is true and the pointer is non-null, its reference count is
+// incremented before the pointer is adopted with intrusive_ptr::reclaim.
+inline Storage unsafeStorageFromTH(void * th_pointer, bool retain) {
+  auto* const storage_impl = static_cast<StorageImpl*>(th_pointer);
+  const bool take_extra_ref = retain && storage_impl != nullptr;
+  if (take_extra_ref) {
+    c10::raw::intrusive_ptr::incref(storage_impl);
+  }
+  return Storage(c10::intrusive_ptr<StorageImpl>::reclaim(storage_impl));
+}
+}

.venv/lib/python3.12/site-packages/torch/include/ATen/core/VariableHooksInterface.h ADDED Viewed

	@@ -0,0 +1,83 @@

+#pragma once
+#include <ATen/core/Tensor.h>
+#include <c10/macros/Export.h>
+// A little explanation about why this file exists at all.  We have
+// a few methods on the Tensor class which require reified access to
+// AutogradMeta.  In open source, this isn't a big deal: we just access
+// torch/csrc/autograd/variable.h from aten/src/ATen/core/Tensor.cpp and
+// we can put the definitions inline.  This is because everything gets balled
+// into a single dynamic library in the end.
+//
+// However, inside our Facebook internal version of our build system, we
+// have a split between aten and torch/csrc.  So we cannot simply just
+// cross this boundary.  "Now wait," you might say, "Why don't we just
+// merge the libraries inside Facebook".  Well, the problem is that there
+// are some downstream applications which are at binary size limit, and
+// incorporating all of the extra code from libtorch would push them
+// over (admarket/adreview/service:adreviewservice, see also
+// https://github.com/pytorch/pytorch/pull/29299)  So if you want to do that,
+// we have to fix all of the services like this.
+//
+// I didn't want to block eliminating Tensor-Variable on this work, so I
+// had to introduce another dynamic dispatch to get to the variable
+// implementations (which live in torch/csrc/autograd/variable.cpp, FYI).
+//
+// I also considered using our existing dynamic dispatch mechanism, c10
+// dispatcher, to do this.  However, (1) some of the functions on Tensor
+// have weird signatures that are not supported by autograd, and (2)
+// see this bug https://github.com/pytorch/pytorch/issues/30102
+namespace torch::autograd {
+struct Node;
+} // namespace torch::autograd
+namespace at::impl {
+// Abstract interface through which the Tensor/TensorBase methods that need
+// autograd reach their implementation. Every operation is pure virtual and
+// receives the tensor it acts on as an explicit argument; an implementation
+// is installed with SetVariableHooks and fetched with GetVariableHooks.
+struct TORCH_API VariableHooksInterface {
+  virtual ~VariableHooksInterface() = default;
+  virtual TensorBase tensor_data(const TensorBase&) const = 0;
+  virtual TensorBase variable_data(const TensorBase&) const = 0;
+  virtual const std::shared_ptr<torch::autograd::Node>& grad_fn(
+      const TensorBase&) const = 0;
+  // Hook registration/removal; `pos` is the index returned by _register_hook
+  // (presumably -- confirm against the implementation).
+  virtual unsigned _register_hook(
+      const TensorBase&,
+      std::function<TensorBase(const TensorBase&)> hook) const = 0;
+  virtual void remove_hook(const TensorBase&, unsigned pos) const = 0;
+  virtual bool is_view(const TensorBase&) const = 0;
+  virtual const TensorBase& base(const TensorBase&) const = 0;
+  virtual const std::string& name(const TensorBase&) const = 0;
+  virtual bool is_leaf(const TensorBase&) const = 0;
+  virtual int64_t output_nr(const TensorBase&) const = 0;
+  virtual void set_data(const TensorBase&, const TensorBase&) const = 0;
+  virtual TensorBase data(const TensorBase&) const = 0;
+  virtual int64_t _version(const TensorBase&) const = 0;
+  virtual void retain_grad(const TensorBase&) const = 0;
+  virtual bool retains_grad(const TensorBase&) const = 0;
+  // NOTE(review): the parameters are unnamed; they look like
+  // (self, inputs, gradient, retain_graph, create_graph) -- confirm against
+  // the implementation before relying on this order.
+  virtual void _backward(
+      const Tensor&,
+      TensorList,
+      const std::optional<Tensor>&,
+      std::optional<bool>,
+      bool) const = 0;
+  virtual void requires_grad_(const TensorBase&, bool) const = 0;
+  virtual void basic_autograd_not_implemented_fallback(
+      const c10::OperatorHandle& op,
+      c10::DispatchKeySet dispatch_keys,
+      torch::jit::Stack* stack) const = 0;
+};
+TORCH_API void SetVariableHooks(VariableHooksInterface* hooks);
+TORCH_API VariableHooksInterface* GetVariableHooks();
+TORCH_API bool HasVariableHooks();
+// Helper whose constructor calls SetVariableHooks(hooks); constructing an
+// instance is therefore enough to install `hooks`.
+struct TORCH_API VariableHooksRegisterer {
+  explicit VariableHooksRegisterer(VariableHooksInterface* hooks) {
+    SetVariableHooks(hooks);
+  }
+};
+} // namespace at::impl

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Variadic.h ADDED Viewed

	@@ -0,0 +1,92 @@

+#pragma once
+#include <utility>
+#include <c10/util/ArrayRef.h>
+#include <ATen/core/List.h>
+namespace at {
+// This class allows you to write variadic functions which
+// call a (possibly overloaded) function on each argument,
+// in order.  This is most commonly used in autogenerated code,
+// where it is convenient to have a function that can uniformly
+// take arguments of different types.  If your arguments
+// are homogeneous consider using a std::initializer_list instead.
+//
+// For examples of this in use, see torch/csrc/utils/variadic.h
+//
+// `F` is the derived visitor type (CRTP): IterArgs calls `F::operator()` on
+// each argument and consults `F::short_circuit()` after each call.
+template <typename F>
+struct IterArgs {
+  // Base case: no arguments left, return the visitor itself.
+  template <typename... Args>
+  inline F& apply() {
+    return self();
+  }
+  // NB: Use perfect forwarding here, otherwise we'll make value
+  // copies of all arguments!
+  // Visits `arg`, then recurses on the remaining arguments unless the
+  // visitor asks to stop via short_circuit().
+  template <typename T, typename... Args>
+  inline F& apply(T&& arg, Args&&... args) {
+    self()(std::forward<T>(arg));
+    if (self().short_circuit()) {
+      return self();
+    } else {
+      return apply(std::forward<Args>(args)...);
+    }
+  }
+  // Here are some handy overloads which provide sensible
+  // defaults for container-like structures that one might
+  // be interested in recursing into.  You can enable them
+  // by adding:
+  //
+  //    using IterArgs<YourStructName>::operator()
+  //
+  // to your struct.  These are not enabled by default because
+  // you may be able to process these structures more efficiently
+  // than handling them one-by-one.
+  // Each overload visits the elements in order and stops as soon as
+  // short_circuit() reports true.
+  template <typename T>
+  void operator()(c10::IListRef<T> args) {
+    for (const auto& arg : args) {
+      self()(arg);
+      if (self().short_circuit())
+        return;
+    }
+  }
+  template <typename T>
+  void operator()(at::ArrayRef<T> args) {
+    for (const auto& arg : args) {
+      self()(arg);
+      if (self().short_circuit())
+        return;
+    }
+  }
+  template <typename T>
+  void operator()(const torch::List<T>& args) {
+    for (const auto& arg : args) {
+      self()(arg);
+      if (self().short_circuit())
+        return;
+    }
+  }
+  // NB: we need to specify std::vector manually as C++ won't
+  // do an implicit conversion to make a template deduction go through.
+  template <typename T>
+  void operator()(const std::vector<T>& args) {
+    self()(at::ArrayRef<T>{args});
+  }
+  // Default: never stop early. Because it is always called through self(),
+  // a short_circuit() declared in F takes precedence over this one.
+  constexpr bool short_circuit() const {
+    return false;
+  }
+ private:
+  // Downcast to the derived visitor type F.
+  inline F& self() {
+    return *static_cast<F*>(this);
+  }
+};
+} // namespace at

.venv/lib/python3.12/site-packages/torch/include/ATen/core/Vitals.h ADDED Viewed

	@@ -0,0 +1,94 @@

+#pragma once
+#include <ostream>
+#include <sstream>
+#include <unordered_map>
+#include <c10/core/impl/LocalDispatchKeySet.h>
+namespace at::vitals {
+TORCH_API bool torchVitalEnabled();
+// A single vital attribute: a string value that is built up by streaming
+// values into it.
+struct TORCH_API TorchVitalAttr {
+  // always initialized to empty
+  std::string value;
+  // Appends the streamed representation of `t` to `value`, but only while
+  // torchVitalEnabled() is true; otherwise this is a no-op.
+  template <typename T>
+  TorchVitalAttr& operator<<(const T& t) {
+    if (torchVitalEnabled()) {
+      std::stringstream ss;
+      ss << t;
+      value += ss.str();
+    }
+    return *this;
+  }
+  // Replaces (rather than appends to) `value` with the streamed
+  // representation of `t` when `force` is set or vitals are enabled.
+  template <typename T>
+  void write(const T& t, bool force) {
+    if (force || torchVitalEnabled()) {
+      std::stringstream ss;
+      ss << t;
+      value = ss.str();
+    }
+  }
+};
+// A named vital holding a map from attribute name to TorchVitalAttr.
+// A name is mandatory: the default constructor is deleted.
+struct TORCH_API TorchVital {
+  std::string name;
+  std::unordered_map<std::string, TorchVitalAttr> attrs;
+  explicit TorchVital(std::string n) : name(std::move(n)) {}
+  TorchVital(const TorchVital&) = default;
+  TorchVital(TorchVital&&) = default;
+  TorchVital& operator=(const TorchVital&) = default;
+  TorchVital& operator=(TorchVital&&) = default;
+  TorchVital() = delete;
+  // Both overloads return a reference to the attribute named `attr`; they are
+  // defined outside this header.
+  TorchVitalAttr& create(const std::string& attr);
+  TorchVitalAttr& create(const std::string& attr, bool force);
+  friend std::ostream& operator<<(std::ostream& os, const TorchVital& dt);
+  // User-provided destructor, defined outside this header.
+  ~TorchVital();
+};
+std::ostream& operator<<(std::ostream& os, TorchVital const& tv);
+// A way to access vitals by string names instead of by global reference.
+// This enables access to vitals from the PythonAPI.
+// Non-copyable, non-movable registry that maps vital names to TorchVital
+// objects; the shared instance is the `VitalsAPI` global declared below.
+class TORCH_API APIVitals {
+ public:
+  bool vitals_enabled;
+  // Set any vital sign that was added to the map.
+  bool setVital(
+      const std::string& vital_name,
+      const std::string& attr_name,
+      const std::string& value,
+      bool force = false);
+  std::string readVitals();
+  APIVitals();
+  // Ensure this stays a singleton
+  APIVitals(APIVitals const& other) = delete;
+  APIVitals(APIVitals&& other) = delete;
+  APIVitals& operator=(const APIVitals&) = delete;
+  APIVitals& operator=(APIVitals&&) = delete;
+  ~APIVitals() = default;
+ private:
+  // Vitals keyed by name.
+  std::unordered_map<std::string, TorchVital> name_map_;
+};
+extern TORCH_API APIVitals VitalsAPI;
+} // namespace at::vitals
+// Declares (without defining) the global vital `TorchVital_<name>`. `extern`
+// is required: without it this line would be a definition that
+// default-constructs TorchVital, whose default constructor is deleted.
+#define TORCH_VITAL_DECLARE(name) \
+  extern TORCH_API at::vitals::TorchVital TorchVital_##name;
+// Defines the global vital `TorchVital_<name>`, constructed from the
+// stringified `name`.
+#define TORCH_VITAL_DEFINE(name) \
+  TORCH_API at::vitals::TorchVital TorchVital_##name(#name);
+// Expands to the identifier of the vital declared/defined for `name`.
+#define TORCH_VITAL_BASE(name) TorchVital_##name
+// Calls TorchVital::create with the stringified `attr` on the vital `name`.
+#define TORCH_VITAL(name, attr) TORCH_VITAL_BASE(name).create(#attr)

.venv/lib/python3.12/site-packages/torch/include/ATen/core/alias_info.h ADDED Viewed

	@@ -0,0 +1,162 @@

+#pragma once
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+#include <ATen/core/symbol.h>
+#include <c10/util/Exception.h>
+#include <c10/util/hash.h>
+namespace c10 {
+/**
+ * class AliasInfo
+ *
+ * Data structure to hold aliasing information for an `Argument`. They can be
+ * nested to represent aliasing information on contained types.
+ *
+ * There is a `beforeSet` which describes the aliasing information before the
+ * operator executes, and an `afterSet` that describes aliasing info
+ * after execution.
+ */
+class AliasInfo {
+ public:
+  AliasInfo() = default;
+  // Builds an AliasInfo from qualified-string set names; every string is
+  // converted with Symbol::fromQualString.
+  AliasInfo(bool is_write, const std::set<std::string>& before_qual_strings, const std::set<std::string>& after_qual_strings) : isWrite_(is_write) {
+    for (const auto& s: before_qual_strings) {
+      beforeSets_.insert(Symbol::fromQualString(s));
+    }
+    for (const auto& s : after_qual_strings) {
+      afterSets_.insert(Symbol::fromQualString(s));
+    }
+  }
+  // Symbol for the set that can alias anything
+  static Symbol wildcardSet() {
+    static const Symbol wc = Symbol::fromQualString("alias::*");
+    return wc;
+  }
+  void setIsWrite(bool isWrite) {
+    isWrite_ = isWrite;
+  }
+  bool isWrite() const {
+    return isWrite_;
+  }
+  void addBeforeSet(Symbol aliasSet) {
+    beforeSets_.insert(aliasSet);
+  }
+  void addAfterSet(Symbol aliasSet) {
+    afterSets_.insert(aliasSet);
+  }
+  const std::unordered_set<Symbol>& beforeSets() const {
+    return beforeSets_;
+  }
+  const std::unordered_set<Symbol>& afterSets() const {
+    return afterSets_;
+  }
+  // Returns the single "before" set; asserts that there is exactly one.
+  Symbol beforeSet() const {
+    AT_ASSERT(beforeSets_.size() == 1);
+    return *beforeSets_.begin();
+  }
+  // True when the wildcard set is among the before / after sets.
+  bool isWildcardBefore() const {
+    return beforeSets_.count(wildcardSet()) != 0;
+  }
+  bool isWildcardAfter() const {
+    return afterSets_.count(wildcardSet()) != 0;
+  }
+  // the alias info for the contained types of the type
+  // e.g. if this is an annotation on List[T], `sets` refers to
+  // the alias sets that the list may be in
+  // while containedTypes()[0] refers to the sets that members of the list
+  // may be in
+  void addContainedType(AliasInfo aliasInfo) {
+    containedTypes_.push_back(std::move(aliasInfo));
+  }
+  const std::vector<AliasInfo>& containedTypes() const {
+    return containedTypes_;
+  }
+ private:
+  std::unordered_set<Symbol> beforeSets_;
+  std::unordered_set<Symbol> afterSets_;
+  std::vector<AliasInfo> containedTypes_;
+  bool isWrite_ = false;
+};
+// Two annotations compare equal when their write flags, before sets, after
+// sets and contained-type annotations (element-wise, in order) all match.
+inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) {
+  if (lhs.isWrite() != rhs.isWrite()) {
+    return false;
+  }
+  if (!(lhs.beforeSets() == rhs.beforeSets())) {
+    return false;
+  }
+  if (!(lhs.afterSets() == rhs.afterSets())) {
+    return false;
+  }
+  return lhs.containedTypes() == rhs.containedTypes();
+}
+// Streams the annotation the way it is represented in the schema:
+// "(" + before sets joined by "|" + "!" when isWrite() + " -> " followed by
+// the after sets joined by "|" (only when they differ from the before sets)
+// + ")". Contained types are not printed. Sets are emitted in the iteration
+// order of the underlying unordered_set.
+inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
+  // Writes the unqualified names of `symbols`, separated by "|".
+  const auto writeJoined = [&out](const std::unordered_set<Symbol>& symbols) {
+    auto it = symbols.begin();
+    const auto last = symbols.end();
+    if (it == last) {
+      return;
+    }
+    out << it->toUnqualString();
+    for (++it; it != last; ++it) {
+      out << "|";
+      out << it->toUnqualString();
+    }
+  };
+  const auto& before = aliasInfo.beforeSets();
+  const auto& after = aliasInfo.afterSets();
+  out << "(";
+  writeJoined(before);
+  if (aliasInfo.isWrite()) {
+    out << "!";
+  }
+  if (before != after) {
+    out << " -> ";
+    writeJoined(after);
+  }
+  out << ")";
+  return out;
+}
+} // namespace c10
+namespace std {
+// std::hash specialization for c10::AliasInfo. Combines the write flag, both
+// alias-set collections and, recursively, every contained-type annotation.
+template <>
+struct hash<c10::AliasInfo> {
+  size_t operator()(const c10::AliasInfo& aliasInfo) const {
+    // NOTE: unordered_set iteration order is unspecified, so the per-set hash
+    // cannot use the order-dependent hash_combine. XOR is commutative, which
+    // makes the folded value independent of iteration order.
+    const auto xorFold = [](const std::unordered_set<c10::Symbol>& symbols) {
+      size_t folded = 0;
+      for (const auto& symbol : symbols) {
+        folded ^= std::hash<c10::Symbol>()(symbol);
+      }
+      return folded;
+    };
+    const size_t beforeHash = xorFold(aliasInfo.beforeSets());
+    const size_t afterHash = xorFold(aliasInfo.afterSets());
+    size_t result = std::hash<bool>()(aliasInfo.isWrite());
+    result = c10::hash_combine(result, beforeHash);
+    result = c10::hash_combine(result, afterHash);
+    // Contained types live in a vector, so their order is meaningful and the
+    // order-dependent hash_combine is appropriate here.
+    for (const auto& contained : aliasInfo.containedTypes()) {
+      result = c10::hash_combine(result, (*this)(contained));
+    }
+    return result;
+  }
+};
+} // namespace std

.venv/lib/python3.12/site-packages/torch/include/ATen/core/aten_interned_strings.h ADDED Viewed

	@@ -0,0 +1,2294 @@

+#pragma once
+// @generated by torchgen/gen.py from aten_interned_strings.h
+#if defined(TORCH_ASSERT_NO_OPERATORS) || defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
+#error This change adds a dependency on native_functions.yaml,          \
+  meaning the file will need to be re-compiled every time an operator   \
+  is changed or added. Consider if including <ATen/core/symbol.h> for   \
+  the c10::Symbol class would be sufficient, or if your change would be \
+  better placed in another file.
+#endif
+// ATen symbols correspond exactly to operators defined in ATen. Every
+// symbol here corresponds exactly to an ATen operation defined in
+// native_functions.yaml; attributes are in one-to-one correspondence
+// with their ATen name.
+#define FORALL_ATEN_BASE_SYMBOLS(_) \
+_(aten, __and__) \
+_(aten, __iand__) \
+_(aten, __ilshift__) \
+_(aten, __ior__) \
+_(aten, __irshift__) \
+_(aten, __ixor__) \
+_(aten, __lshift__) \
+_(aten, __or__) \
+_(aten, __rshift__) \
+_(aten, __xor__) \
+_(aten, _adaptive_avg_pool2d) \
+_(aten, _adaptive_avg_pool2d_backward) \
+_(aten, _adaptive_avg_pool3d) \
+_(aten, _adaptive_avg_pool3d_backward) \
+_(aten, _add_batch_dim) \
+_(aten, _add_relu) \
+_(aten, _add_relu_) \
+_(aten, _addmm_activation) \
+_(aten, _aminmax) \
+_(aten, _amp_foreach_non_finite_check_and_unscale) \
+_(aten, _amp_foreach_non_finite_check_and_unscale_) \
+_(aten, _amp_update_scale) \
+_(aten, _amp_update_scale_) \
+_(aten, _assert_async) \
+_(aten, _assert_scalar) \
+_(aten, _assert_tensor_metadata) \
+_(aten, _autocast_to_full_precision) \
+_(aten, _autocast_to_reduced_precision) \
+_(aten, _backward) \
+_(aten, _batch_norm_impl_index) \
+_(aten, _batch_norm_impl_index_backward) \
+_(aten, _batch_norm_no_update) \
+_(aten, _batch_norm_with_update) \
+_(aten, _batch_norm_with_update_functional) \
+_(aten, _cast_Byte) \
+_(aten, _cast_Char) \
+_(aten, _cast_Double) \
+_(aten, _cast_Float) \
+_(aten, _cast_Half) \
+_(aten, _cast_Int) \
+_(aten, _cast_Long) \
+_(aten, _cast_Short) \
+_(aten, _cdist_backward) \
+_(aten, _cdist_forward) \
+_(aten, _cholesky_solve_helper) \
+_(aten, _choose_qparams_per_tensor) \
+_(aten, _chunk_cat) \
+_(aten, _coalesce) \
+_(aten, _coalesced) \
+_(aten, _coalesced_) \
+_(aten, _compute_linear_combination) \
+_(aten, _conj) \
+_(aten, _conj_copy) \
+_(aten, _conj_physical) \
+_(aten, _conv_depthwise2d) \
+_(aten, _convert_indices_from_coo_to_csr) \
+_(aten, _convert_indices_from_csr_to_coo) \
+_(aten, _convert_weight_to_int4pack) \
+_(aten, _convert_weight_to_int4pack_for_cpu) \
+_(aten, _convolution) \
+_(aten, _convolution_double_backward) \
+_(aten, _convolution_mode) \
+_(aten, _copy_from) \
+_(aten, _copy_from_and_resize) \
+_(aten, _cslt_compress) \
+_(aten, _cslt_sparse_mm) \
+_(aten, _cslt_sparse_mm_search) \
+_(aten, _ctc_loss) \
+_(aten, _ctc_loss_backward) \
+_(aten, _cudnn_attention_forward) \
+_(aten, _cudnn_ctc_loss) \
+_(aten, _cudnn_init_dropout_state) \
+_(aten, _cudnn_rnn) \
+_(aten, _cudnn_rnn_backward) \
+_(aten, _cudnn_rnn_flatten_weight) \
+_(aten, _cufft_clear_plan_cache) \
+_(aten, _cufft_get_plan_cache_max_size) \
+_(aten, _cufft_get_plan_cache_size) \
+_(aten, _cufft_set_plan_cache_max_size) \
+_(aten, _cummax_helper) \
+_(aten, _cummin_helper) \
+_(aten, _debug_has_internal_overlap) \
+_(aten, _dimI) \
+_(aten, _dimV) \
+_(aten, _dim_arange) \
+_(aten, _dirichlet_grad) \
+_(aten, _dyn_quant_matmul_4bit) \
+_(aten, _dyn_quant_pack_4bit_weight) \
+_(aten, _efficient_attention_backward) \
+_(aten, _efficient_attention_forward) \
+_(aten, _efficientzerotensor) \
+_(aten, _embedding_bag) \
+_(aten, _embedding_bag_backward) \
+_(aten, _embedding_bag_dense_backward) \
+_(aten, _embedding_bag_forward_only) \
+_(aten, _embedding_bag_per_sample_weights_backward) \
+_(aten, _embedding_bag_sparse_backward) \
+_(aten, _empty_affine_quantized) \
+_(aten, _empty_per_channel_affine_quantized) \
+_(aten, _euclidean_dist) \
+_(aten, _fake_quantize_learnable_per_channel_affine) \
+_(aten, _fake_quantize_learnable_per_channel_affine_backward) \
+_(aten, _fake_quantize_learnable_per_tensor_affine) \
+_(aten, _fake_quantize_learnable_per_tensor_affine_backward) \
+_(aten, _fake_quantize_per_tensor_affine_cachemask_tensor_qparams) \
+_(aten, _fft_c2c) \
+_(aten, _fft_c2r) \
+_(aten, _fft_r2c) \
+_(aten, _fill_mem_eff_dropout_mask) \
+_(aten, _fill_mem_eff_dropout_mask_) \
+_(aten, _flash_attention_backward) \
+_(aten, _flash_attention_forward) \
+_(aten, _foobar) \
+_(aten, _foreach_abs) \
+_(aten, _foreach_abs_) \
+_(aten, _foreach_acos) \
+_(aten, _foreach_acos_) \
+_(aten, _foreach_add) \
+_(aten, _foreach_add_) \
+_(aten, _foreach_addcdiv) \
+_(aten, _foreach_addcdiv_) \
+_(aten, _foreach_addcmul) \
+_(aten, _foreach_addcmul_) \
+_(aten, _foreach_asin) \
+_(aten, _foreach_asin_) \
+_(aten, _foreach_atan) \
+_(aten, _foreach_atan_) \
+_(aten, _foreach_ceil) \
+_(aten, _foreach_ceil_) \
+_(aten, _foreach_clamp_max) \
+_(aten, _foreach_clamp_max_) \
+_(aten, _foreach_clamp_min) \
+_(aten, _foreach_clamp_min_) \
+_(aten, _foreach_copy) \
+_(aten, _foreach_copy_) \
+_(aten, _foreach_cos) \
+_(aten, _foreach_cos_) \
+_(aten, _foreach_cosh) \
+_(aten, _foreach_cosh_) \
+_(aten, _foreach_div) \
+_(aten, _foreach_div_) \
+_(aten, _foreach_erf) \
+_(aten, _foreach_erf_) \
+_(aten, _foreach_erfc) \
+_(aten, _foreach_erfc_) \
+_(aten, _foreach_exp) \
+_(aten, _foreach_exp_) \
+_(aten, _foreach_expm1) \
+_(aten, _foreach_expm1_) \
+_(aten, _foreach_floor) \
+_(aten, _foreach_floor_) \
+_(aten, _foreach_frac) \
+_(aten, _foreach_frac_) \
+_(aten, _foreach_lerp) \
+_(aten, _foreach_lerp_) \
+_(aten, _foreach_lgamma) \
+_(aten, _foreach_lgamma_) \
+_(aten, _foreach_log) \
+_(aten, _foreach_log10) \
+_(aten, _foreach_log10_) \
+_(aten, _foreach_log1p) \
+_(aten, _foreach_log1p_) \
+_(aten, _foreach_log2) \
+_(aten, _foreach_log2_) \
+_(aten, _foreach_log_) \
+_(aten, _foreach_max) \
+_(aten, _foreach_maximum) \
+_(aten, _foreach_maximum_) \
+_(aten, _foreach_minimum) \
+_(aten, _foreach_minimum_) \
+_(aten, _foreach_mul) \
+_(aten, _foreach_mul_) \
+_(aten, _foreach_neg) \
+_(aten, _foreach_neg_) \
+_(aten, _foreach_norm) \
+_(aten, _foreach_pow) \
+_(aten, _foreach_pow_) \
+_(aten, _foreach_reciprocal) \
+_(aten, _foreach_reciprocal_) \
+_(aten, _foreach_round) \
+_(aten, _foreach_round_) \
+_(aten, _foreach_rsqrt) \
+_(aten, _foreach_rsqrt_) \
+_(aten, _foreach_sigmoid) \
+_(aten, _foreach_sigmoid_) \
+_(aten, _foreach_sign) \
+_(aten, _foreach_sign_) \
+_(aten, _foreach_sin) \
+_(aten, _foreach_sin_) \
+_(aten, _foreach_sinh) \
+_(aten, _foreach_sinh_) \
+_(aten, _foreach_sqrt) \
+_(aten, _foreach_sqrt_) \
+_(aten, _foreach_sub) \
+_(aten, _foreach_sub_) \
+_(aten, _foreach_tan) \
+_(aten, _foreach_tan_) \
+_(aten, _foreach_tanh) \
+_(aten, _foreach_tanh_) \
+_(aten, _foreach_trunc) \
+_(aten, _foreach_trunc_) \
+_(aten, _foreach_zero) \
+_(aten, _foreach_zero_) \
+_(aten, _functional_assert_async) \
+_(aten, _functional_assert_scalar) \
+_(aten, _functional_sym_constrain_range) \
+_(aten, _functional_sym_constrain_range_for_size) \
+_(aten, _fused_adagrad) \
+_(aten, _fused_adagrad_) \
+_(aten, _fused_adam) \
+_(aten, _fused_adam_) \
+_(aten, _fused_adamw) \
+_(aten, _fused_adamw_) \
+_(aten, _fused_dropout) \
+_(aten, _fused_moving_avg_obs_fq_helper) \
+_(aten, _fused_moving_avg_obs_fq_helper_functional) \
+_(aten, _fused_rms_norm) \
+_(aten, _fused_sdp_choice) \
+_(aten, _fused_sgd) \
+_(aten, _fused_sgd_) \
+_(aten, _fw_primal) \
+_(aten, _fw_primal_copy) \
+_(aten, _gather_sparse_backward) \
+_(aten, _grid_sampler_2d_cpu_fallback) \
+_(aten, _grid_sampler_2d_cpu_fallback_backward) \
+_(aten, _grouped_mm) \
+_(aten, _has_compatible_shallow_copy_type) \
+_(aten, _has_same_storage_numel) \
+_(aten, _histogramdd_bin_edges) \
+_(aten, _histogramdd_from_bin_cts) \
+_(aten, _histogramdd_from_bin_tensors) \
+_(aten, _index_put_impl) \
+_(aten, _index_put_impl_) \
+_(aten, _indices) \
+_(aten, _indices_copy) \
+_(aten, _int_mm) \
+_(aten, _is_all_true) \
+_(aten, _is_any_true) \
+_(aten, _is_zerotensor) \
+_(aten, _jagged_to_padded_dense_forward) \
+_(aten, _lazy_clone) \
+_(aten, _linalg_check_errors) \
+_(aten, _linalg_det) \
+_(aten, _linalg_eigh) \
+_(aten, _linalg_eigvals) \
+_(aten, _linalg_slogdet) \
+_(aten, _linalg_solve_ex) \
+_(aten, _linalg_svd) \
+_(aten, _local_scalar_dense) \
+_(aten, _log_softmax) \
+_(aten, _log_softmax_backward_data) \
+_(aten, _logcumsumexp) \
+_(aten, _lstm_mps) \
+_(aten, _lu_with_info) \
+_(aten, _make_dep_token) \
+_(aten, _make_dual) \
+_(aten, _make_dual_copy) \
+_(aten, _make_per_channel_quantized_tensor) \
+_(aten, _make_per_tensor_quantized_tensor) \
+_(aten, _masked_scale) \
+_(aten, _masked_softmax) \
+_(aten, _masked_softmax_backward) \
+_(aten, _mixed_dtypes_linear) \
+_(aten, _mkldnn_reshape) \
+_(aten, _mkldnn_transpose) \
+_(aten, _mkldnn_transpose_) \
+_(aten, _mps_convolution) \
+_(aten, _mps_convolution_transpose) \
+_(aten, _native_batch_norm_legit) \
+_(aten, _native_batch_norm_legit_functional) \
+_(aten, _native_batch_norm_legit_no_training) \
+_(aten, _native_multi_head_attention) \
+_(aten, _neg_view) \
+_(aten, _neg_view_copy) \
+_(aten, _nested_compute_contiguous_strides_offsets) \
+_(aten, _nested_from_padded) \
+_(aten, _nested_from_padded_and_nested_example) \
+_(aten, _nested_from_padded_tensor) \
+_(aten, _nested_get_jagged_dummy) \
+_(aten, _nested_get_lengths) \
+_(aten, _nested_get_max_seqlen) \
+_(aten, _nested_get_min_seqlen) \
+_(aten, _nested_get_offsets) \
+_(aten, _nested_get_ragged_idx) \
+_(aten, _nested_get_values) \
+_(aten, _nested_get_values_copy) \
+_(aten, _nested_select_backward) \
+_(aten, _nested_sum_backward) \
+_(aten, _nested_tensor_from_mask) \
+_(aten, _nested_tensor_from_mask_left_aligned) \
+_(aten, _nested_tensor_from_tensor_list) \
+_(aten, _nested_tensor_size) \
+_(aten, _nested_tensor_softmax_with_shape) \
+_(aten, _nested_tensor_storage_offsets) \
+_(aten, _nested_tensor_strides) \
+_(aten, _nested_view_from_buffer) \
+_(aten, _nested_view_from_buffer_copy) \
+_(aten, _nested_view_from_jagged) \
+_(aten, _nested_view_from_jagged_copy) \
+_(aten, _new_zeros_with_same_feature_meta) \
+_(aten, _nnpack_available) \
+_(aten, _nnpack_spatial_convolution) \
+_(aten, _nnz) \
+_(aten, _pack_padded_sequence) \
+_(aten, _pack_padded_sequence_backward) \
+_(aten, _pad_circular) \
+_(aten, _pad_enum) \
+_(aten, _pad_packed_sequence) \
+_(aten, _padded_dense_to_jagged_forward) \
+_(aten, _pdist_backward) \
+_(aten, _pdist_forward) \
+_(aten, _pin_memory) \
+_(aten, _prelu_kernel) \
+_(aten, _prelu_kernel_backward) \
+_(aten, _print) \
+_(aten, _propagate_xla_data) \
+_(aten, _remove_batch_dim) \
+_(aten, _reshape_alias) \
+_(aten, _reshape_alias_copy) \
+_(aten, _reshape_copy) \
+_(aten, _reshape_from_tensor) \
+_(aten, _resize_output) \
+_(aten, _resize_output_) \
+_(aten, _rowwise_prune) \
+_(aten, _safe_softmax) \
+_(aten, _sample_dirichlet) \
+_(aten, _saturate_weight_to_fp16) \
+_(aten, _scaled_dot_product_attention_math) \
+_(aten, _scaled_dot_product_attention_math_for_mps) \
+_(aten, _scaled_dot_product_cudnn_attention) \
+_(aten, _scaled_dot_product_cudnn_attention_backward) \
+_(aten, _scaled_dot_product_efficient_attention) \
+_(aten, _scaled_dot_product_efficient_attention_backward) \
+_(aten, _scaled_dot_product_flash_attention) \
+_(aten, _scaled_dot_product_flash_attention_backward) \
+_(aten, _scaled_dot_product_flash_attention_for_cpu) \
+_(aten, _scaled_dot_product_flash_attention_for_cpu_backward) \
+_(aten, _scaled_dot_product_fused_attention_overrideable) \
+_(aten, _scaled_dot_product_fused_attention_overrideable_backward) \
+_(aten, _scaled_grouped_mm) \
+_(aten, _scaled_mm) \
+_(aten, _segment_reduce_backward) \
+_(aten, _shape_as_tensor) \
+_(aten, _slow_conv2d_backward) \
+_(aten, _slow_conv2d_forward) \
+_(aten, _sobol_engine_draw) \
+_(aten, _sobol_engine_ff) \
+_(aten, _sobol_engine_ff_) \
+_(aten, _sobol_engine_initialize_state) \
+_(aten, _sobol_engine_initialize_state_) \
+_(aten, _sobol_engine_scramble) \
+_(aten, _sobol_engine_scramble_) \
+_(aten, _softmax) \
+_(aten, _softmax_backward_data) \
+_(aten, _sparse_addmm) \
+_(aten, _sparse_broadcast_to) \
+_(aten, _sparse_broadcast_to_copy) \
+_(aten, _sparse_bsc_tensor_unsafe) \
+_(aten, _sparse_bsr_tensor_unsafe) \
+_(aten, _sparse_compressed_tensor_unsafe) \
+_(aten, _sparse_compressed_tensor_with_dims) \
+_(aten, _sparse_coo_tensor_unsafe) \
+_(aten, _sparse_coo_tensor_with_dims) \
+_(aten, _sparse_coo_tensor_with_dims_and_tensors) \
+_(aten, _sparse_csc_tensor_unsafe) \
+_(aten, _sparse_csr_prod) \
+_(aten, _sparse_csr_sum) \
+_(aten, _sparse_csr_tensor_unsafe) \
+_(aten, _sparse_log_softmax) \
+_(aten, _sparse_log_softmax_backward_data) \
+_(aten, _sparse_mask_projection) \
+_(aten, _sparse_mm) \
+_(aten, _sparse_mm_reduce_impl) \
+_(aten, _sparse_mm_reduce_impl_backward) \
+_(aten, _sparse_semi_structured_addmm) \
+_(aten, _sparse_semi_structured_apply) \
+_(aten, _sparse_semi_structured_apply_dense) \
+_(aten, _sparse_semi_structured_linear) \
+_(aten, _sparse_semi_structured_mm) \
+_(aten, _sparse_semi_structured_tile) \
+_(aten, _sparse_softmax) \
+_(aten, _sparse_softmax_backward_data) \
+_(aten, _sparse_sparse_matmul) \
+_(aten, _sparse_sum) \
+_(aten, _sparse_sum_backward) \
+_(aten, _spdiags) \
+_(aten, _spsolve) \
+_(aten, _stack) \
+_(aten, _standard_gamma) \
+_(aten, _standard_gamma_grad) \
+_(aten, _test_ambiguous_defaults) \
+_(aten, _test_autograd_multiple_dispatch) \
+_(aten, _test_autograd_multiple_dispatch_view) \
+_(aten, _test_autograd_multiple_dispatch_view_copy) \
+_(aten, _test_check_tensor) \
+_(aten, _test_functorch_fallback) \
+_(aten, _test_optional_filled_intlist) \
+_(aten, _test_optional_floatlist) \
+_(aten, _test_optional_intlist) \
+_(aten, _test_parallel_materialize) \
+_(aten, _test_serialization_subcmul) \
+_(aten, _test_string_default) \
+_(aten, _test_warn_in_autograd) \
+_(aten, _thnn_differentiable_gru_cell_backward) \
+_(aten, _thnn_differentiable_lstm_cell_backward) \
+_(aten, _thnn_fused_gru_cell) \
+_(aten, _thnn_fused_gru_cell_backward) \
+_(aten, _thnn_fused_lstm_cell) \
+_(aten, _thnn_fused_lstm_cell_backward) \
+_(aten, _thnn_fused_lstm_cell_backward_impl) \
+_(aten, _to_copy) \
+_(aten, _to_cpu) \
+_(aten, _to_dense) \
+_(aten, _to_sparse) \
+_(aten, _to_sparse_bsc) \
+_(aten, _to_sparse_bsr) \
+_(aten, _to_sparse_csc) \
+_(aten, _to_sparse_csr) \
+_(aten, _to_sparse_semi_structured) \
+_(aten, _transform_bias_rescale_qkv) \
+_(aten, _transformer_encoder_layer_fwd) \
+_(aten, _trilinear) \
+_(aten, _triton_multi_head_attention) \
+_(aten, _triton_scaled_dot_attention) \
+_(aten, _unique) \
+_(aten, _unique2) \
+_(aten, _unpack_dual) \
+_(aten, _unsafe_index) \
+_(aten, _unsafe_index_put) \
+_(aten, _unsafe_masked_index) \
+_(aten, _unsafe_masked_index_put_accumulate) \
+_(aten, _unsafe_view) \
+_(aten, _upsample_bicubic2d_aa) \
+_(aten, _upsample_bicubic2d_aa_backward) \
+_(aten, _upsample_bilinear2d_aa) \
+_(aten, _upsample_bilinear2d_aa_backward) \
+_(aten, _upsample_nearest_exact1d) \
+_(aten, _upsample_nearest_exact1d_backward) \
+_(aten, _upsample_nearest_exact2d) \
+_(aten, _upsample_nearest_exact2d_backward) \
+_(aten, _upsample_nearest_exact3d) \
+_(aten, _upsample_nearest_exact3d_backward) \
+_(aten, _use_cudnn_ctc_loss) \
+_(aten, _use_cudnn_rnn_flatten_weight) \
+_(aten, _validate_compressed_sparse_indices) \
+_(aten, _validate_sparse_bsc_tensor_args) \
+_(aten, _validate_sparse_bsr_tensor_args) \
+_(aten, _validate_sparse_compressed_tensor_args) \
+_(aten, _validate_sparse_coo_tensor_args) \
+_(aten, _validate_sparse_csc_tensor_args) \
+_(aten, _validate_sparse_csr_tensor_args) \
+_(aten, _values) \
+_(aten, _values_copy) \
+_(aten, _version) \
+_(aten, _weight_int4pack_mm) \
+_(aten, _weight_int4pack_mm_for_cpu) \
+_(aten, _weight_int4pack_mm_with_scales_and_zeros) \
+_(aten, _weight_int8pack_mm) \
+_(aten, _weight_norm) \
+_(aten, _weight_norm_differentiable_backward) \
+_(aten, _weight_norm_interface) \
+_(aten, _weight_norm_interface_backward) \
+_(aten, _wrapped_linear_prepack) \
+_(aten, _wrapped_quantized_linear_prepacked) \
+_(aten, abs) \
+_(aten, abs_) \
+_(aten, absolute) \
+_(aten, absolute_) \
+_(aten, acos) \
+_(aten, acos_) \
+_(aten, acosh) \
+_(aten, acosh_) \
+_(aten, adaptive_avg_pool1d) \
+_(aten, adaptive_avg_pool2d) \
+_(aten, adaptive_avg_pool3d) \
+_(aten, adaptive_avg_pool3d_backward) \
+_(aten, adaptive_max_pool1d) \
+_(aten, adaptive_max_pool2d) \
+_(aten, adaptive_max_pool2d_backward) \
+_(aten, adaptive_max_pool3d) \
+_(aten, adaptive_max_pool3d_backward) \
+_(aten, add) \
+_(aten, add_) \
+_(aten, addbmm) \
+_(aten, addbmm_) \
+_(aten, addcdiv) \
+_(aten, addcdiv_) \
+_(aten, addcmul) \
+_(aten, addcmul_) \
+_(aten, addmm) \
+_(aten, addmm_) \
+_(aten, addmv) \
+_(aten, addmv_) \
+_(aten, addr) \
+_(aten, addr_) \
+_(aten, adjoint) \
+_(aten, affine_grid_generator) \
+_(aten, affine_grid_generator_backward) \
+_(aten, alias) \
+_(aten, alias_copy) \
+_(aten, align_as) \
+_(aten, align_tensors) \
+_(aten, align_to) \
+_(aten, all) \
+_(aten, allclose) \
+_(aten, alpha_dropout) \
+_(aten, alpha_dropout_) \
+_(aten, amax) \
+_(aten, amin) \
+_(aten, aminmax) \
+_(aten, angle) \
+_(aten, any) \
+_(aten, arange) \
+_(aten, arccos) \
+_(aten, arccos_) \
+_(aten, arccosh) \
+_(aten, arccosh_) \
+_(aten, arcsin) \
+_(aten, arcsin_) \
+_(aten, arcsinh) \
+_(aten, arcsinh_) \
+_(aten, arctan) \
+_(aten, arctan2) \
+_(aten, arctan2_) \
+_(aten, arctan_) \
+_(aten, arctanh) \
+_(aten, arctanh_) \
+_(aten, argmax) \
+_(aten, argmin) \
+_(aten, argsort) \
+_(aten, argwhere) \
+_(aten, as_strided) \
+_(aten, as_strided_) \
+_(aten, as_strided_copy) \
+_(aten, as_strided_scatter) \
+_(aten, asin) \
+_(aten, asin_) \
+_(aten, asinh) \
+_(aten, asinh_) \
+_(aten, atan) \
+_(aten, atan2) \
+_(aten, atan2_) \
+_(aten, atan_) \
+_(aten, atanh) \
+_(aten, atanh_) \
+_(aten, atleast_1d) \
+_(aten, atleast_2d) \
+_(aten, atleast_3d) \
+_(aten, avg_pool1d) \
+_(aten, avg_pool2d) \
+_(aten, avg_pool2d_backward) \
+_(aten, avg_pool3d) \
+_(aten, avg_pool3d_backward) \
+_(aten, baddbmm) \
+_(aten, baddbmm_) \
+_(aten, bartlett_window) \
+_(aten, batch_norm) \
+_(aten, batch_norm_backward) \
+_(aten, batch_norm_backward_elemt) \
+_(aten, batch_norm_backward_reduce) \
+_(aten, batch_norm_elemt) \
+_(aten, batch_norm_gather_stats) \
+_(aten, batch_norm_gather_stats_with_counts) \
+_(aten, batch_norm_stats) \
+_(aten, batch_norm_update_stats) \
+_(aten, bernoulli) \
+_(aten, bernoulli_) \
+_(aten, bilinear) \
+_(aten, binary_cross_entropy) \
+_(aten, binary_cross_entropy_backward) \
+_(aten, binary_cross_entropy_with_logits) \
+_(aten, bincount) \
+_(aten, binomial) \
+_(aten, bitwise_and) \
+_(aten, bitwise_and_) \
+_(aten, bitwise_left_shift) \
+_(aten, bitwise_left_shift_) \
+_(aten, bitwise_not) \
+_(aten, bitwise_not_) \
+_(aten, bitwise_or) \
+_(aten, bitwise_or_) \
+_(aten, bitwise_right_shift) \
+_(aten, bitwise_right_shift_) \
+_(aten, bitwise_xor) \
+_(aten, bitwise_xor_) \
+_(aten, blackman_window) \
+_(aten, block_diag) \
+_(aten, bmm) \
+_(aten, broadcast_tensors) \
+_(aten, broadcast_to) \
+_(aten, bucketize) \
+_(aten, can_cast) \
+_(aten, cartesian_prod) \
+_(aten, cat) \
+_(aten, cauchy) \
+_(aten, cauchy_) \
+_(aten, ccol_indices) \
+_(aten, ccol_indices_copy) \
+_(aten, cdist) \
+_(aten, ceil) \
+_(aten, ceil_) \
+_(aten, celu) \
+_(aten, celu_) \
+_(aten, chain_matmul) \
+_(aten, chalf) \
+_(aten, channel_shuffle) \
+_(aten, cholesky) \
+_(aten, cholesky_inverse) \
+_(aten, cholesky_solve) \
+_(aten, choose_qparams_optimized) \
+_(aten, chunk) \
+_(aten, clamp) \
+_(aten, clamp_) \
+_(aten, clamp_max) \
+_(aten, clamp_max_) \
+_(aten, clamp_min) \
+_(aten, clamp_min_) \
+_(aten, clip) \
+_(aten, clip_) \
+_(aten, clone) \
+_(aten, coalesce) \
+_(aten, col2im) \
+_(aten, col_indices) \
+_(aten, col_indices_copy) \
+_(aten, column_stack) \
+_(aten, combinations) \
+_(aten, complex) \
+_(aten, concat) \
+_(aten, concatenate) \
+_(aten, conj) \
+_(aten, conj_physical) \
+_(aten, conj_physical_) \
+_(aten, constant_pad_nd) \
+_(aten, contiguous) \
+_(aten, conv1d) \
+_(aten, conv2d) \
+_(aten, conv3d) \
+_(aten, conv_depthwise3d) \
+_(aten, conv_tbc) \
+_(aten, conv_tbc_backward) \
+_(aten, conv_transpose1d) \
+_(aten, conv_transpose2d) \
+_(aten, conv_transpose3d) \
+_(aten, convolution) \
+_(aten, convolution_backward) \
+_(aten, convolution_backward_overrideable) \
+_(aten, convolution_overrideable) \
+_(aten, copy) \
+_(aten, copy_) \
+_(aten, copy_sparse_to_sparse) \
+_(aten, copy_sparse_to_sparse_) \
+_(aten, copysign) \
+_(aten, copysign_) \
+_(aten, corrcoef) \
+_(aten, cos) \
+_(aten, cos_) \
+_(aten, cosh) \
+_(aten, cosh_) \
+_(aten, cosine_embedding_loss) \
+_(aten, cosine_similarity) \
+_(aten, count_nonzero) \
+_(aten, cov) \
+_(aten, cross) \
+_(aten, cross_entropy_loss) \
+_(aten, crow_indices) \
+_(aten, crow_indices_copy) \
+_(aten, ctc_loss) \
+_(aten, cudnn_affine_grid_generator) \
+_(aten, cudnn_affine_grid_generator_backward) \
+_(aten, cudnn_batch_norm) \
+_(aten, cudnn_batch_norm_backward) \
+_(aten, cudnn_convolution) \
+_(aten, cudnn_convolution_add_relu) \
+_(aten, cudnn_convolution_relu) \
+_(aten, cudnn_convolution_transpose) \
+_(aten, cudnn_grid_sampler) \
+_(aten, cudnn_grid_sampler_backward) \
+_(aten, cudnn_is_acceptable) \
+_(aten, cummax) \
+_(aten, cummaxmin_backward) \
+_(aten, cummin) \
+_(aten, cumprod) \
+_(aten, cumprod_) \
+_(aten, cumprod_backward) \
+_(aten, cumsum) \
+_(aten, cumsum_) \
+_(aten, cumulative_trapezoid) \
+_(aten, data) \
+_(aten, deg2rad) \
+_(aten, deg2rad_) \
+_(aten, dense_dim) \
+_(aten, dequantize) \
+_(aten, det) \
+_(aten, detach) \
+_(aten, detach_) \
+_(aten, detach_copy) \
+_(aten, diag) \
+_(aten, diag_embed) \
+_(aten, diagflat) \
+_(aten, diagonal) \
+_(aten, diagonal_backward) \
+_(aten, diagonal_copy) \
+_(aten, diagonal_scatter) \
+_(aten, diff) \
+_(aten, digamma) \
+_(aten, digamma_) \
+_(aten, dist) \
+_(aten, div) \
+_(aten, div_) \
+_(aten, divide) \
+_(aten, divide_) \
+_(aten, dot) \
+_(aten, dropout) \
+_(aten, dropout_) \
+_(aten, dsplit) \
+_(aten, dstack) \
+_(aten, einsum) \
+_(aten, elu) \
+_(aten, elu_) \
+_(aten, elu_backward) \
+_(aten, embedding) \
+_(aten, embedding_backward) \
+_(aten, embedding_bag) \
+_(aten, embedding_dense_backward) \
+_(aten, embedding_renorm) \
+_(aten, embedding_renorm_) \
+_(aten, embedding_sparse_backward) \
+_(aten, empty) \
+_(aten, empty_like) \
+_(aten, empty_permuted) \
+_(aten, empty_quantized) \
+_(aten, empty_strided) \
+_(aten, eq) \
+_(aten, eq_) \
+_(aten, equal) \
+_(aten, erf) \
+_(aten, erf_) \
+_(aten, erfc) \
+_(aten, erfc_) \
+_(aten, erfinv) \
+_(aten, erfinv_) \
+_(aten, exp) \
+_(aten, exp2) \
+_(aten, exp2_) \
+_(aten, exp_) \
+_(aten, expand) \
+_(aten, expand_as) \
+_(aten, expand_copy) \
+_(aten, expm1) \
+_(aten, expm1_) \
+_(aten, exponential) \
+_(aten, exponential_) \
+_(aten, eye) \
+_(aten, fake_quantize_per_channel_affine) \
+_(aten, fake_quantize_per_channel_affine_cachemask) \
+_(aten, fake_quantize_per_channel_affine_cachemask_backward) \
+_(aten, fake_quantize_per_tensor_affine) \
+_(aten, fake_quantize_per_tensor_affine_cachemask) \
+_(aten, fake_quantize_per_tensor_affine_cachemask_backward) \
+_(aten, fbgemm_linear_fp16_weight) \
+_(aten, fbgemm_linear_fp16_weight_fp32_activation) \
+_(aten, fbgemm_linear_int8_weight) \
+_(aten, fbgemm_linear_int8_weight_fp32_activation) \
+_(aten, fbgemm_linear_quantize_weight) \
+_(aten, fbgemm_pack_gemm_matrix_fp16) \
+_(aten, fbgemm_pack_quantized_matrix) \
+_(aten, feature_alpha_dropout) \
+_(aten, feature_alpha_dropout_) \
+_(aten, feature_dropout) \
+_(aten, feature_dropout_) \
+_(aten, fft_fft) \
+_(aten, fft_fft2) \
+_(aten, fft_fftfreq) \
+_(aten, fft_fftn) \
+_(aten, fft_fftshift) \
+_(aten, fft_hfft) \
+_(aten, fft_hfft2) \
+_(aten, fft_hfftn) \
+_(aten, fft_ifft) \
+_(aten, fft_ifft2) \
+_(aten, fft_ifftn) \
+_(aten, fft_ifftshift) \
+_(aten, fft_ihfft) \
+_(aten, fft_ihfft2) \
+_(aten, fft_ihfftn) \
+_(aten, fft_irfft) \
+_(aten, fft_irfft2) \
+_(aten, fft_irfftn) \
+_(aten, fft_rfft) \
+_(aten, fft_rfft2) \
+_(aten, fft_rfftfreq) \
+_(aten, fft_rfftn) \
+_(aten, fill) \
+_(aten, fill_) \
+_(aten, fill_diagonal) \
+_(aten, fill_diagonal_) \
+_(aten, fix) \
+_(aten, fix_) \
+_(aten, flatten) \
+_(aten, flatten_dense_tensors) \
+_(aten, flip) \
+_(aten, fliplr) \
+_(aten, flipud) \
+_(aten, float_power) \
+_(aten, float_power_) \
+_(aten, floor) \
+_(aten, floor_) \
+_(aten, floor_divide) \
+_(aten, floor_divide_) \
+_(aten, fmax) \
+_(aten, fmin) \
+_(aten, fmod) \
+_(aten, fmod_) \
+_(aten, frac) \
+_(aten, frac_) \
+_(aten, fractional_max_pool2d) \
+_(aten, fractional_max_pool2d_backward) \
+_(aten, fractional_max_pool3d) \
+_(aten, fractional_max_pool3d_backward) \
+_(aten, frexp) \
+_(aten, frobenius_norm) \
+_(aten, from_file) \
+_(aten, full) \
+_(aten, full_like) \
+_(aten, fused_moving_avg_obs_fake_quant) \
+_(aten, gather) \
+_(aten, gather_backward) \
+_(aten, gcd) \
+_(aten, gcd_) \
+_(aten, ge) \
+_(aten, ge_) \
+_(aten, gelu) \
+_(aten, gelu_) \
+_(aten, gelu_backward) \
+_(aten, geometric) \
+_(aten, geometric_) \
+_(aten, geqrf) \
+_(aten, ger) \
+_(aten, glu) \
+_(aten, glu_backward) \
+_(aten, glu_backward_jvp) \
+_(aten, glu_jvp) \
+_(aten, gradient) \
+_(aten, greater) \
+_(aten, greater_) \
+_(aten, greater_equal) \
+_(aten, greater_equal_) \
+_(aten, grid_sampler) \
+_(aten, grid_sampler_2d) \
+_(aten, grid_sampler_2d_backward) \
+_(aten, grid_sampler_3d) \
+_(aten, grid_sampler_3d_backward) \
+_(aten, group_norm) \
+_(aten, gru) \
+_(aten, gru_cell) \
+_(aten, gt) \
+_(aten, gt_) \
+_(aten, hamming_window) \
+_(aten, hann_window) \
+_(aten, hardshrink) \
+_(aten, hardshrink_backward) \
+_(aten, hardsigmoid) \
+_(aten, hardsigmoid_) \
+_(aten, hardsigmoid_backward) \
+_(aten, hardswish) \
+_(aten, hardswish_) \
+_(aten, hardswish_backward) \
+_(aten, hardtanh) \
+_(aten, hardtanh_) \
+_(aten, hardtanh_backward) \
+_(aten, heaviside) \
+_(aten, heaviside_) \
+_(aten, hinge_embedding_loss) \
+_(aten, histc) \
+_(aten, histogram) \
+_(aten, histogramdd) \
+_(aten, hsplit) \
+_(aten, hspmm) \
+_(aten, hstack) \
+_(aten, huber_loss) \
+_(aten, huber_loss_backward) \
+_(aten, hypot) \
+_(aten, hypot_) \
+_(aten, i0) \
+_(aten, i0_) \
+_(aten, igamma) \
+_(aten, igamma_) \
+_(aten, igammac) \
+_(aten, igammac_) \
+_(aten, im2col) \
+_(aten, imag) \
+_(aten, index) \
+_(aten, index_add) \
+_(aten, index_add_) \
+_(aten, index_copy) \
+_(aten, index_copy_) \
+_(aten, index_fill) \
+_(aten, index_fill_) \
+_(aten, index_put) \
+_(aten, index_put_) \
+_(aten, index_reduce) \
+_(aten, index_reduce_) \
+_(aten, index_select) \
+_(aten, index_select_backward) \
+_(aten, indices) \
+_(aten, indices_copy) \
+_(aten, infinitely_differentiable_gelu_backward) \
+_(aten, inner) \
+_(aten, instance_norm) \
+_(aten, int_repr) \
+_(aten, inverse) \
+_(aten, is_coalesced) \
+_(aten, is_complex) \
+_(aten, is_conj) \
+_(aten, is_distributed) \
+_(aten, is_floating_point) \
+_(aten, is_inference) \
+_(aten, is_leaf) \
+_(aten, is_neg) \
+_(aten, is_nonzero) \
+_(aten, is_pinned) \
+_(aten, is_same_size) \
+_(aten, is_set_to) \
+_(aten, is_signed) \
+_(aten, is_vulkan_available) \
+_(aten, isclose) \
+_(aten, isfinite) \
+_(aten, isin) \
+_(aten, isinf) \
+_(aten, isnan) \
+_(aten, isneginf) \
+_(aten, isposinf) \
+_(aten, isreal) \
+_(aten, istft) \
+_(aten, item) \
+_(aten, kaiser_window) \
+_(aten, kl_div) \
+_(aten, kron) \
+_(aten, kthvalue) \
+_(aten, l1_loss) \
+_(aten, layer_norm) \
+_(aten, lcm) \
+_(aten, lcm_) \
+_(aten, ldexp) \
+_(aten, ldexp_) \
+_(aten, le) \
+_(aten, le_) \
+_(aten, leaky_relu) \
+_(aten, leaky_relu_) \
+_(aten, leaky_relu_backward) \
+_(aten, lerp) \
+_(aten, lerp_) \
+_(aten, less) \
+_(aten, less_) \
+_(aten, less_equal) \
+_(aten, less_equal_) \
+_(aten, lgamma) \
+_(aten, lgamma_) \
+_(aten, lift) \
+_(aten, lift_fresh) \
+_(aten, lift_fresh_copy) \
+_(aten, linalg_cholesky) \
+_(aten, linalg_cholesky_ex) \
+_(aten, linalg_cond) \
+_(aten, linalg_cross) \
+_(aten, linalg_det) \
+_(aten, linalg_diagonal) \
+_(aten, linalg_eig) \
+_(aten, linalg_eigh) \
+_(aten, linalg_eigvals) \
+_(aten, linalg_eigvalsh) \
+_(aten, linalg_householder_product) \
+_(aten, linalg_inv) \
+_(aten, linalg_inv_ex) \
+_(aten, linalg_ldl_factor) \
+_(aten, linalg_ldl_factor_ex) \
+_(aten, linalg_ldl_solve) \
+_(aten, linalg_lstsq) \
+_(aten, linalg_lu) \
+_(aten, linalg_lu_factor) \
+_(aten, linalg_lu_factor_ex) \
+_(aten, linalg_lu_solve) \
+_(aten, linalg_matmul) \
+_(aten, linalg_matrix_exp) \
+_(aten, linalg_matrix_norm) \
+_(aten, linalg_matrix_power) \
+_(aten, linalg_matrix_rank) \
+_(aten, linalg_multi_dot) \
+_(aten, linalg_norm) \
+_(aten, linalg_pinv) \
+_(aten, linalg_qr) \
+_(aten, linalg_slogdet) \
+_(aten, linalg_solve) \
+_(aten, linalg_solve_ex) \
+_(aten, linalg_solve_triangular) \
+_(aten, linalg_svd) \
+_(aten, linalg_svdvals) \
+_(aten, linalg_tensorinv) \
+_(aten, linalg_tensorsolve) \
+_(aten, linalg_vander) \
+_(aten, linalg_vecdot) \
+_(aten, linalg_vector_norm) \
+_(aten, linear) \
+_(aten, linear_backward) \
+_(aten, linspace) \
+_(aten, log) \
+_(aten, log10) \
+_(aten, log10_) \
+_(aten, log1p) \
+_(aten, log1p_) \
+_(aten, log2) \
+_(aten, log2_) \
+_(aten, log_) \
+_(aten, log_normal) \
+_(aten, log_normal_) \
+_(aten, log_sigmoid) \
+_(aten, log_sigmoid_backward) \
+_(aten, log_sigmoid_forward) \
+_(aten, log_softmax) \
+_(aten, logaddexp) \
+_(aten, logaddexp2) \
+_(aten, logcumsumexp) \
+_(aten, logdet) \
+_(aten, logical_and) \
+_(aten, logical_and_) \
+_(aten, logical_not) \
+_(aten, logical_not_) \
+_(aten, logical_or) \
+_(aten, logical_or_) \
+_(aten, logical_xor) \
+_(aten, logical_xor_) \
+_(aten, logit) \
+_(aten, logit_) \
+_(aten, logit_backward) \
+_(aten, logspace) \
+_(aten, logsumexp) \
+_(aten, lshift) \
+_(aten, lstm) \
+_(aten, lstm_cell) \
+_(aten, lstm_mps_backward) \
+_(aten, lt) \
+_(aten, lt_) \
+_(aten, lu_solve) \
+_(aten, lu_unpack) \
+_(aten, mH) \
+_(aten, mT) \
+_(aten, margin_ranking_loss) \
+_(aten, masked_fill) \
+_(aten, masked_fill_) \
+_(aten, masked_scatter) \
+_(aten, masked_scatter_) \
+_(aten, masked_scatter_backward) \
+_(aten, masked_select) \
+_(aten, masked_select_backward) \
+_(aten, matmul) \
+_(aten, matmul_backward) \
+_(aten, matrix_H) \
+_(aten, matrix_exp) \
+_(aten, matrix_exp_backward) \
+_(aten, matrix_power) \
+_(aten, max) \
+_(aten, max_pool1d) \
+_(aten, max_pool1d_with_indices) \
+_(aten, max_pool2d) \
+_(aten, max_pool2d_backward) \
+_(aten, max_pool2d_with_indices) \
+_(aten, max_pool2d_with_indices_backward) \
+_(aten, max_pool3d) \
+_(aten, max_pool3d_with_indices) \
+_(aten, max_pool3d_with_indices_backward) \
+_(aten, max_unpool2d) \
+_(aten, max_unpool3d) \
+_(aten, maximum) \
+_(aten, mean) \
+_(aten, median) \
+_(aten, meshgrid) \
+_(aten, min) \
+_(aten, minimum) \
+_(aten, miopen_batch_norm) \
+_(aten, miopen_batch_norm_backward) \
+_(aten, miopen_convolution) \
+_(aten, miopen_convolution_add_relu) \
+_(aten, miopen_convolution_relu) \
+_(aten, miopen_convolution_transpose) \
+_(aten, miopen_depthwise_convolution) \
+_(aten, miopen_rnn) \
+_(aten, miopen_rnn_backward) \
+_(aten, mish) \
+_(aten, mish_) \
+_(aten, mish_backward) \
+_(aten, mkldnn_adaptive_avg_pool2d) \
+_(aten, mkldnn_adaptive_avg_pool2d_backward) \
+_(aten, mkldnn_convolution) \
+_(aten, mkldnn_linear) \
+_(aten, mkldnn_linear_backward) \
+_(aten, mkldnn_linear_backward_input) \
+_(aten, mkldnn_linear_backward_weights) \
+_(aten, mkldnn_max_pool2d) \
+_(aten, mkldnn_max_pool2d_backward) \
+_(aten, mkldnn_max_pool3d) \
+_(aten, mkldnn_max_pool3d_backward) \
+_(aten, mkldnn_reorder_conv2d_weight) \
+_(aten, mkldnn_reorder_conv3d_weight) \
+_(aten, mkldnn_rnn_layer) \
+_(aten, mkldnn_rnn_layer_backward) \
+_(aten, mm) \
+_(aten, mode) \
+_(aten, moveaxis) \
+_(aten, movedim) \
+_(aten, mps_convolution_backward) \
+_(aten, mps_convolution_transpose_backward) \
+_(aten, mse_loss) \
+_(aten, mse_loss_backward) \
+_(aten, msort) \
+_(aten, mul) \
+_(aten, mul_) \
+_(aten, multi_margin_loss) \
+_(aten, multi_margin_loss_backward) \
+_(aten, multilabel_margin_loss) \
+_(aten, multilabel_margin_loss_backward) \
+_(aten, multilabel_margin_loss_forward) \
+_(aten, multinomial) \
+_(aten, multiply) \
+_(aten, multiply_) \
+_(aten, mv) \
+_(aten, mvlgamma) \
+_(aten, mvlgamma_) \
+_(aten, nan_to_num) \
+_(aten, nan_to_num_) \
+_(aten, nanmean) \
+_(aten, nanmedian) \
+_(aten, nanquantile) \
+_(aten, nansum) \
+_(aten, narrow) \
+_(aten, narrow_copy) \
+_(aten, native_batch_norm) \
+_(aten, native_batch_norm_backward) \
+_(aten, native_channel_shuffle) \
+_(aten, native_dropout) \
+_(aten, native_dropout_backward) \
+_(aten, native_group_norm) \
+_(aten, native_group_norm_backward) \
+_(aten, native_layer_norm) \
+_(aten, native_layer_norm_backward) \
+_(aten, native_norm) \
+_(aten, ne) \
+_(aten, ne_) \
+_(aten, neg) \
+_(aten, neg_) \
+_(aten, negative) \
+_(aten, negative_) \
+_(aten, nested_to_padded_tensor) \
+_(aten, new_empty) \
+_(aten, new_empty_strided) \
+_(aten, new_full) \
+_(aten, new_ones) \
+_(aten, new_zeros) \
+_(aten, nextafter) \
+_(aten, nextafter_) \
+_(aten, nll_loss) \
+_(aten, nll_loss2d) \
+_(aten, nll_loss2d_backward) \
+_(aten, nll_loss2d_forward) \
+_(aten, nll_loss_backward) \
+_(aten, nll_loss_forward) \
+_(aten, nll_loss_nd) \
+_(aten, nonzero) \
+_(aten, nonzero_numpy) \
+_(aten, nonzero_static) \
+_(aten, norm) \
+_(aten, norm_except_dim) \
+_(aten, normal) \
+_(aten, normal_) \
+_(aten, normal_functional) \
+_(aten, not_equal) \
+_(aten, not_equal_) \
+_(aten, nuclear_norm) \
+_(aten, numpy_T) \
+_(aten, one_hot) \
+_(aten, ones) \
+_(aten, ones_like) \
+_(aten, orgqr) \
+_(aten, ormqr) \
+_(aten, outer) \
+_(aten, output_nr) \
+_(aten, pad) \
+_(aten, pad_sequence) \
+_(aten, pairwise_distance) \
+_(aten, pdist) \
+_(aten, permute) \
+_(aten, permute_copy) \
+_(aten, pin_memory) \
+_(aten, pinverse) \
+_(aten, pixel_shuffle) \
+_(aten, pixel_unshuffle) \
+_(aten, poisson) \
+_(aten, poisson_nll_loss) \
+_(aten, polar) \
+_(aten, polygamma) \
+_(aten, polygamma_) \
+_(aten, positive) \
+_(aten, pow) \
+_(aten, pow_) \
+_(aten, prelu) \
+_(aten, prod) \
+_(aten, promote_types) \
+_(aten, put) \
+_(aten, put_) \
+_(aten, q_per_channel_axis) \
+_(aten, q_per_channel_scales) \
+_(aten, q_per_channel_zero_points) \
+_(aten, q_scale) \
+_(aten, q_zero_point) \
+_(aten, qr) \
+_(aten, qscheme) \
+_(aten, quantile) \
+_(aten, quantize_per_channel) \
+_(aten, quantize_per_tensor) \
+_(aten, quantize_per_tensor_dynamic) \
+_(aten, quantized_batch_norm) \
+_(aten, quantized_gru_cell) \
+_(aten, quantized_lstm_cell) \
+_(aten, quantized_max_pool1d) \
+_(aten, quantized_max_pool2d) \
+_(aten, quantized_max_pool3d) \
+_(aten, quantized_rnn_relu_cell) \
+_(aten, quantized_rnn_tanh_cell) \
+_(aten, rad2deg) \
+_(aten, rad2deg_) \
+_(aten, rand) \
+_(aten, rand_like) \
+_(aten, randint) \
+_(aten, randint_like) \
+_(aten, randn) \
+_(aten, randn_like) \
+_(aten, random) \
+_(aten, random_) \
+_(aten, randperm) \
+_(aten, range) \
+_(aten, ravel) \
+_(aten, real) \
+_(aten, reciprocal) \
+_(aten, reciprocal_) \
+_(aten, record_stream) \
+_(aten, refine_names) \
+_(aten, reflection_pad1d) \
+_(aten, reflection_pad1d_backward) \
+_(aten, reflection_pad2d) \
+_(aten, reflection_pad2d_backward) \
+_(aten, reflection_pad3d) \
+_(aten, reflection_pad3d_backward) \
+_(aten, relu) \
+_(aten, relu6) \
+_(aten, relu6_) \
+_(aten, relu_) \
+_(aten, remainder) \
+_(aten, remainder_) \
+_(aten, rename) \
+_(aten, rename_) \
+_(aten, renorm) \
+_(aten, renorm_) \
+_(aten, repeat) \
+_(aten, repeat_interleave) \
+_(aten, replication_pad1d) \
+_(aten, replication_pad1d_backward) \
+_(aten, replication_pad2d) \
+_(aten, replication_pad2d_backward) \
+_(aten, replication_pad3d) \
+_(aten, replication_pad3d_backward) \
+_(aten, requires_grad) \
+_(aten, requires_grad_) \
+_(aten, reshape) \
+_(aten, reshape_as) \
+_(aten, resize) \
+_(aten, resize_) \
+_(aten, resize_as) \
+_(aten, resize_as_) \
+_(aten, resize_as_sparse) \
+_(aten, resize_as_sparse_) \
+_(aten, resolve_conj) \
+_(aten, resolve_neg) \
+_(aten, result_type) \
+_(aten, retain_grad) \
+_(aten, retains_grad) \
+_(aten, rms_norm) \
+_(aten, rnn_relu) \
+_(aten, rnn_relu_cell) \
+_(aten, rnn_tanh) \
+_(aten, rnn_tanh_cell) \
+_(aten, roll) \
+_(aten, rot90) \
+_(aten, round) \
+_(aten, round_) \
+_(aten, row_indices) \
+_(aten, row_indices_copy) \
+_(aten, row_stack) \
+_(aten, rrelu) \
+_(aten, rrelu_) \
+_(aten, rrelu_with_noise) \
+_(aten, rrelu_with_noise_) \
+_(aten, rrelu_with_noise_backward) \
+_(aten, rrelu_with_noise_functional) \
+_(aten, rshift) \
+_(aten, rsqrt) \
+_(aten, rsqrt_) \
+_(aten, rsub) \
+_(aten, scalar_tensor) \
+_(aten, scaled_dot_product_attention) \
+_(aten, scatter) \
+_(aten, scatter_) \
+_(aten, scatter_add) \
+_(aten, scatter_add_) \
+_(aten, scatter_reduce) \
+_(aten, scatter_reduce_) \
+_(aten, searchsorted) \
+_(aten, segment_reduce) \
+_(aten, select) \
+_(aten, select_backward) \
+_(aten, select_copy) \
+_(aten, select_scatter) \
+_(aten, selu) \
+_(aten, selu_) \
+_(aten, set) \
+_(aten, set_) \
+_(aten, set_data) \
+_(aten, sgn) \
+_(aten, sgn_) \
+_(aten, sigmoid) \
+_(aten, sigmoid_) \
+_(aten, sigmoid_backward) \
+_(aten, sign) \
+_(aten, sign_) \
+_(aten, signbit) \
+_(aten, silu) \
+_(aten, silu_) \
+_(aten, silu_backward) \
+_(aten, sin) \
+_(aten, sin_) \
+_(aten, sinc) \
+_(aten, sinc_) \
+_(aten, sinh) \
+_(aten, sinh_) \
+_(aten, size) \
+_(aten, slice) \
+_(aten, slice_backward) \
+_(aten, slice_copy) \
+_(aten, slice_inverse) \
+_(aten, slice_scatter) \
+_(aten, slogdet) \
+_(aten, slow_conv3d) \
+_(aten, slow_conv3d_forward) \
+_(aten, slow_conv_dilated2d) \
+_(aten, slow_conv_dilated3d) \
+_(aten, slow_conv_transpose2d) \
+_(aten, slow_conv_transpose3d) \
+_(aten, smm) \
+_(aten, smooth_l1_loss) \
+_(aten, smooth_l1_loss_backward) \
+_(aten, soft_margin_loss) \
+_(aten, soft_margin_loss_backward) \
+_(aten, softmax) \
+_(aten, softplus) \
+_(aten, softplus_backward) \
+_(aten, softshrink) \
+_(aten, softshrink_backward) \
+_(aten, sort) \
+_(aten, sparse_bsc_tensor) \
+_(aten, sparse_bsr_tensor) \
+_(aten, sparse_compressed_tensor) \
+_(aten, sparse_coo_tensor) \
+_(aten, sparse_csc_tensor) \
+_(aten, sparse_csr_tensor) \
+_(aten, sparse_dim) \
+_(aten, sparse_mask) \
+_(aten, sparse_resize) \
+_(aten, sparse_resize_) \
+_(aten, sparse_resize_and_clear) \
+_(aten, sparse_resize_and_clear_) \
+_(aten, sparse_sampled_addmm) \
+_(aten, special_airy_ai) \
+_(aten, special_bessel_j0) \
+_(aten, special_bessel_j1) \
+_(aten, special_bessel_y0) \
+_(aten, special_bessel_y1) \
+_(aten, special_chebyshev_polynomial_t) \
+_(aten, special_chebyshev_polynomial_u) \
+_(aten, special_chebyshev_polynomial_v) \
+_(aten, special_chebyshev_polynomial_w) \
+_(aten, special_digamma) \
+_(aten, special_entr) \
+_(aten, special_erf) \
+_(aten, special_erfc) \
+_(aten, special_erfcx) \
+_(aten, special_erfinv) \
+_(aten, special_exp2) \
+_(aten, special_expit) \
+_(aten, special_expm1) \
+_(aten, special_gammainc) \
+_(aten, special_gammaincc) \
+_(aten, special_gammaln) \
+_(aten, special_hermite_polynomial_h) \
+_(aten, special_hermite_polynomial_he) \
+_(aten, special_i0) \
+_(aten, special_i0e) \
+_(aten, special_i1) \
+_(aten, special_i1e) \
+_(aten, special_laguerre_polynomial_l) \
+_(aten, special_legendre_polynomial_p) \
+_(aten, special_log1p) \
+_(aten, special_log_ndtr) \
+_(aten, special_log_softmax) \
+_(aten, special_logit) \
+_(aten, special_logsumexp) \
+_(aten, special_modified_bessel_i0) \
+_(aten, special_modified_bessel_i1) \
+_(aten, special_modified_bessel_k0) \
+_(aten, special_modified_bessel_k1) \
+_(aten, special_multigammaln) \
+_(aten, special_ndtr) \
+_(aten, special_ndtri) \
+_(aten, special_polygamma) \
+_(aten, special_psi) \
+_(aten, special_round) \
+_(aten, special_scaled_modified_bessel_k0) \
+_(aten, special_scaled_modified_bessel_k1) \
+_(aten, special_shifted_chebyshev_polynomial_t) \
+_(aten, special_shifted_chebyshev_polynomial_u) \
+_(aten, special_shifted_chebyshev_polynomial_v) \
+_(aten, special_shifted_chebyshev_polynomial_w) \
+_(aten, special_sinc) \
+_(aten, special_softmax) \
+_(aten, special_spherical_bessel_j0) \
+_(aten, special_xlog1py) \
+_(aten, special_xlogy) \
+_(aten, special_zeta) \
+_(aten, split) \
+_(aten, split_copy) \
+_(aten, split_with_sizes) \
+_(aten, split_with_sizes_copy) \
+_(aten, sqrt) \
+_(aten, sqrt_) \
+_(aten, square) \
+_(aten, square_) \
+_(aten, squeeze) \
+_(aten, squeeze_) \
+_(aten, squeeze_copy) \
+_(aten, sspaddmm) \
+_(aten, stack) \
+_(aten, std) \
+_(aten, std_mean) \
+_(aten, stft) \
+_(aten, stride) \
+_(aten, sub) \
+_(aten, sub_) \
+_(aten, subtract) \
+_(aten, subtract_) \
+_(aten, sum) \
+_(aten, sum_to_size) \
+_(aten, svd) \
+_(aten, swapaxes) \
+_(aten, swapaxes_) \
+_(aten, swapdims) \
+_(aten, swapdims_) \
+_(aten, sym_constrain_range) \
+_(aten, sym_constrain_range_for_size) \
+_(aten, sym_numel) \
+_(aten, sym_size) \
+_(aten, sym_storage_offset) \
+_(aten, sym_stride) \
+_(aten, t) \
+_(aten, t_) \
+_(aten, t_copy) \
+_(aten, take) \
+_(aten, take_along_dim) \
+_(aten, tan) \
+_(aten, tan_) \
+_(aten, tanh) \
+_(aten, tanh_) \
+_(aten, tanh_backward) \
+_(aten, tensor_split) \
+_(aten, tensordot) \
+_(aten, thnn_conv2d) \
+_(aten, threshold) \
+_(aten, threshold_) \
+_(aten, threshold_backward) \
+_(aten, tile) \
+_(aten, to) \
+_(aten, to_dense) \
+_(aten, to_dense_backward) \
+_(aten, to_mkldnn) \
+_(aten, to_mkldnn_backward) \
+_(aten, to_padded_tensor) \
+_(aten, to_sparse) \
+_(aten, to_sparse_bsc) \
+_(aten, to_sparse_bsr) \
+_(aten, to_sparse_csc) \
+_(aten, to_sparse_csr) \
+_(aten, topk) \
+_(aten, trace) \
+_(aten, trace_backward) \
+_(aten, transpose) \
+_(aten, transpose_) \
+_(aten, transpose_copy) \
+_(aten, trapezoid) \
+_(aten, trapz) \
+_(aten, triangular_solve) \
+_(aten, tril) \
+_(aten, tril_) \
+_(aten, tril_indices) \
+_(aten, triplet_margin_loss) \
+_(aten, triu) \
+_(aten, triu_) \
+_(aten, triu_indices) \
+_(aten, true_divide) \
+_(aten, true_divide_) \
+_(aten, trunc) \
+_(aten, trunc_) \
+_(aten, type_as) \
+_(aten, unbind) \
+_(aten, unbind_copy) \
+_(aten, unflatten) \
+_(aten, unflatten_dense_tensors) \
+_(aten, unfold) \
+_(aten, unfold_backward) \
+_(aten, unfold_copy) \
+_(aten, uniform) \
+_(aten, uniform_) \
+_(aten, unique_consecutive) \
+_(aten, unique_dim) \
+_(aten, unique_dim_consecutive) \
+_(aten, unsafe_chunk) \
+_(aten, unsafe_split) \
+_(aten, unsafe_split_with_sizes) \
+_(aten, unsqueeze) \
+_(aten, unsqueeze_) \
+_(aten, unsqueeze_copy) \
+_(aten, upsample_bicubic2d) \
+_(aten, upsample_bicubic2d_backward) \
+_(aten, upsample_bilinear2d) \
+_(aten, upsample_bilinear2d_backward) \
+_(aten, upsample_linear1d) \
+_(aten, upsample_linear1d_backward) \
+_(aten, upsample_nearest1d) \
+_(aten, upsample_nearest1d_backward) \
+_(aten, upsample_nearest2d) \
+_(aten, upsample_nearest2d_backward) \
+_(aten, upsample_nearest3d) \
+_(aten, upsample_nearest3d_backward) \
+_(aten, upsample_trilinear3d) \
+_(aten, upsample_trilinear3d_backward) \
+_(aten, value_selecting_reduction_backward) \
+_(aten, values) \
+_(aten, values_copy) \
+_(aten, vander) \
+_(aten, var) \
+_(aten, var_mean) \
+_(aten, vdot) \
+_(aten, view) \
+_(aten, view_as) \
+_(aten, view_as_complex) \
+_(aten, view_as_complex_copy) \
+_(aten, view_as_real) \
+_(aten, view_as_real_copy) \
+_(aten, view_copy) \
+_(aten, vsplit) \
+_(aten, vstack) \
+_(aten, where) \
+_(aten, xlogy) \
+_(aten, xlogy_) \
+_(aten, zero) \
+_(aten, zero_) \
+_(aten, zeros) \
+_(aten, zeros_like)
+#define FORALL_ATTR_BASE_SYMBOLS(_) /* X-macro: expands to _(attr, NAME) for every NAME listed below; entries appear to be kept in ASCII order */ \
+_(attr, A) \
+_(attr, B) \
+_(attr, C) \
+_(attr, H) \
+_(attr, HxW) \
+_(attr, K) \
+_(attr, L) \
+_(attr, LD) \
+_(attr, LU) \
+_(attr, LU_data) \
+_(attr, LU_pivots) \
+_(attr, M) \
+_(attr, N) \
+_(attr, P) \
+_(attr, Q) \
+_(attr, R) \
+_(attr, S) \
+_(attr, U) \
+_(attr, UPLO) \
+_(attr, V) \
+_(attr, Vh) \
+_(attr, W) \
+_(attr, X) \
+_(attr, a) \
+_(attr, abs) \
+_(attr, accumulate) \
+_(attr, accumulate_matches) \
+_(attr, activation) \
+_(attr, addends) \
+_(attr, adjoint) \
+_(attr, alg_id) \
+_(attr, algorithm) \
+_(attr, alibi_slopes) \
+_(attr, align_corners) \
+_(attr, align_to_window) \
+_(attr, allow_tf32) \
+_(attr, alpha) \
+_(attr, amsgrad) \
+_(attr, anchor) \
+_(attr, angle) \
+_(attr, any) \
+_(attr, api_name) \
+_(attr, append) \
+_(attr, approximate) \
+_(attr, arg1) \
+_(attr, arg2) \
+_(attr, arg3) \
+_(attr, arg_out) \
+_(attr, assert_msg) \
+_(attr, assume_unique) \
+_(attr, atol) \
+_(attr, attn_bias) \
+_(attr, attn_mask) \
+_(attr, average_attn_weights) \
+_(attr, averaging_const) \
+_(attr, aweights) \
+_(attr, axis) \
+_(attr, axis0) \
+_(attr, axis1) \
+_(attr, b) \
+_(attr, b_hh) \
+_(attr, b_ih) \
+_(attr, bag_size) \
+_(attr, base) \
+_(attr, batch1) \
+_(attr, batch2) \
+_(attr, batch_dim) \
+_(attr, batch_first) \
+_(attr, batch_size) \
+_(attr, batch_sizes) \
+_(attr, benchmark) \
+_(attr, beta) \
+_(attr, beta1) \
+_(attr, beta2) \
+_(attr, bias) \
+_(attr, bias_defined) \
+_(attr, bias_g) \
+_(attr, bias_requires_grad) \
+_(attr, bias_sizes) \
+_(attr, bidirectional) \
+_(attr, bin_edges) \
+_(attr, bins) \
+_(attr, bit_width) \
+_(attr, blank) \
+_(attr, block_size) \
+_(attr, blocksize) \
+_(attr, boundaries) \
+_(attr, buffer) \
+_(attr, ccol_indices) \
+_(attr, cdim) \
+_(attr, cdist) \
+_(attr, ceil_mode) \
+_(attr, cell_state_fwd) \
+_(attr, center) \
+_(attr, ch_axis) \
+_(attr, check_errors) \
+_(attr, check_pinning) \
+_(attr, chunks) \
+_(attr, coalesced) \
+_(attr, coefficients) \
+_(attr, col) \
+_(attr, col_indices) \
+_(attr, col_offsets) \
+_(attr, col_offsets_hh) \
+_(attr, col_offsets_ih) \
+_(attr, compressed_A) \
+_(attr, compressed_idx) \
+_(attr, compressed_indices) \
+_(attr, compressed_indices_dtype) \
+_(attr, compute_log_sumexp) \
+_(attr, compute_mode) \
+_(attr, compute_uv) \
+_(attr, compute_v) \
+_(attr, condition) \
+_(attr, copy) \
+_(attr, correction) \
+_(attr, count) \
+_(attr, count_include_pad) \
+_(attr, counts) \
+_(attr, cpu_dtype) \
+_(attr, cpu_enabled) \
+_(attr, cpu_nested_shape_example) \
+_(attr, create_graph) \
+_(attr, crow_indices) \
+_(attr, cu_seqlens_k) \
+_(attr, cu_seqlens_q) \
+_(attr, cuda_dtype) \
+_(attr, cuda_enabled) \
+_(attr, cudnn_enable) \
+_(attr, cudnn_enabled) \
+_(attr, cum_seq_k) \
+_(attr, cum_seq_q) \
+_(attr, custom_mask_type) \
+_(attr, cx) \
+_(attr, cx_) \
+_(attr, cx_tmp) \
+_(attr, cy) \
+_(attr, cy_) \
+_(attr, d) \
+_(attr, dampening) \
+_(attr, data) \
+_(attr, decimals) \
+_(attr, delta) \
+_(attr, dense) \
+_(attr, dense_B) \
+_(attr, dense_dim) \
+_(attr, density) \
+_(attr, dep_token) \
+_(attr, descending) \
+_(attr, destination) \
+_(attr, deterministic) \
+_(attr, device) \
+_(attr, device_index) \
+_(attr, dgrad_glu) \
+_(attr, diagonal) \
+_(attr, diagonals) \
+_(attr, dilation) \
+_(attr, dim) \
+_(attr, dim0) \
+_(attr, dim1) \
+_(attr, dim2) \
+_(attr, dimension) \
+_(attr, dims) \
+_(attr, dims_other) \
+_(attr, dims_self) \
+_(attr, divisor_override) \
+_(attr, downscale_factor) \
+_(attr, driver) \
+_(attr, dropout) \
+_(attr, dropout_mask) \
+_(attr, dropout_p) \
+_(attr, dropout_seed) \
+_(attr, dropout_state) \
+_(attr, dst) \
+_(attr, dtype) \
+_(attr, dual) \
+_(attr, dummy) \
+_(attr, dx) \
+_(attr, edge_order) \
+_(attr, eigenvalues) \
+_(attr, eigenvectors) \
+_(attr, eigvals) \
+_(attr, eigvecs) \
+_(attr, element) \
+_(attr, elements) \
+_(attr, ellipsis_idx) \
+_(attr, embed_dim) \
+_(attr, enable_gqa) \
+_(attr, end) \
+_(attr, end_dim) \
+_(attr, eps) \
+_(attr, epsilon) \
+_(attr, equal_nan) \
+_(attr, equation) \
+_(attr, exp_avg_sqs) \
+_(attr, exp_avgs) \
+_(attr, expand1) \
+_(attr, expand2) \
+_(attr, expand3) \
+_(attr, exponent) \
+_(attr, exponential_average_factor) \
+_(attr, fake_quant_enabled) \
+_(attr, fake_quant_on) \
+_(attr, ffn_bias_1) \
+_(attr, ffn_bias_2) \
+_(attr, ffn_weight_1) \
+_(attr, ffn_weight_2) \
+_(attr, filename) \
+_(attr, fill) \
+_(attr, fill_value) \
+_(attr, flat) \
+_(attr, forward) \
+_(attr, found_inf) \
+_(attr, from) \
+_(attr, from_) \
+_(attr, full) \
+_(attr, full_matrices) \
+_(attr, fuse_transform_0213) \
+_(attr, fweights) \
+_(attr, g) \
+_(attr, gO) \
+_(attr, generator) \
+_(attr, ggI) \
+_(attr, ggW) \
+_(attr, ggb) \
+_(attr, glu) \
+_(attr, grad) \
+_(attr, grad_bias) \
+_(attr, grad_cy) \
+_(attr, grad_factor) \
+_(attr, grad_glu) \
+_(attr, grad_hy) \
+_(attr, grad_in) \
+_(attr, grad_input) \
+_(attr, grad_input_mask) \
+_(attr, grad_out) \
+_(attr, grad_out_) \
+_(attr, grad_output) \
+_(attr, grad_scale) \
+_(attr, grad_w) \
+_(attr, grad_weight) \
+_(attr, grad_x) \
+_(attr, grad_y) \
+_(attr, gradient) \
+_(attr, grads) \
+_(attr, grid) \
+_(attr, group) \
+_(attr, groups) \
+_(attr, growth_interval) \
+_(attr, growth_tracker) \
+_(attr, half_to_float) \
+_(attr, has_bias) \
+_(attr, has_biases) \
+_(attr, hermitian) \
+_(attr, hidden_bias) \
+_(attr, hidden_gates) \
+_(attr, hidden_size) \
+_(attr, high) \
+_(attr, hist) \
+_(attr, hop_length) \
+_(attr, hx) \
+_(attr, hx_) \
+_(attr, hy_) \
+_(attr, i1) \
+_(attr, i2) \
+_(attr, i3) \
+_(attr, ignore_index) \
+_(attr, imag) \
+_(attr, impl_index) \
+_(attr, implicit) \
+_(attr, in_features) \
+_(attr, include_last_offset) \
+_(attr, include_self) \
+_(attr, increasing) \
+_(attr, ind) \
+_(attr, index) \
+_(attr, index_dtype) \
+_(attr, indexing) \
+_(attr, indices) \
+_(attr, info) \
+_(attr, initial) \
+_(attr, innerKTiles) \
+_(attr, inp) \
+_(attr, input) \
+_(attr, input1) \
+_(attr, input2) \
+_(attr, input3) \
+_(attr, input_bias) \
+_(attr, input_dtype) \
+_(attr, input_g) \
+_(attr, input_gates) \
+_(attr, input_lengths) \
+_(attr, input_scale) \
+_(attr, input_size) \
+_(attr, input_sizes) \
+_(attr, input_zero_point) \
+_(attr, inputs) \
+_(attr, interpolation) \
+_(attr, interpolation_mode) \
+_(attr, inv_scale) \
+_(attr, inverse) \
+_(attr, invert) \
+_(attr, invstd) \
+_(attr, is_causal) \
+_(attr, is_coalesced) \
+_(attr, is_crow) \
+_(attr, is_first_step) \
+_(attr, is_matrix) \
+_(attr, is_result) \
+_(attr, is_target) \
+_(attr, k) \
+_(attr, keepdim) \
+_(attr, kernel_size) \
+_(attr, key) \
+_(attr, label_smoothing) \
+_(attr, lambd) \
+_(attr, largest) \
+_(attr, last_dim_size) \
+_(attr, layersOutputs) \
+_(attr, layout) \
+_(attr, left) \
+_(attr, length) \
+_(attr, lengths) \
+_(attr, level) \
+_(attr, like) \
+_(attr, list) \
+_(attr, log_alpha) \
+_(attr, log_input) \
+_(attr, log_probs) \
+_(attr, log_target) \
+_(attr, logabsdet) \
+_(attr, logsumexp) \
+_(attr, low) \
+_(attr, lower) \
+_(attr, lr) \
+_(attr, lr_decay) \
+_(attr, ltm) \
+_(attr, m) \
+_(attr, mantissa) \
+_(attr, margin) \
+_(attr, mask) \
+_(attr, mask_check) \
+_(attr, mask_type) \
+_(attr, masked_grad) \
+_(attr, mat) \
+_(attr, mat1) \
+_(attr, mat1_meta) \
+_(attr, mat2) \
+_(attr, matrices) \
+_(attr, max) \
+_(attr, max_exp_avg_sqs) \
+_(attr, max_k) \
+_(attr, max_lengths) \
+_(attr, max_norm) \
+_(attr, max_q) \
+_(attr, max_seqlen) \
+_(attr, max_seqlen_k) \
+_(attr, max_seqlen_q) \
+_(attr, max_size) \
+_(attr, max_val) \
+_(attr, max_values) \
+_(attr, maximize) \
+_(attr, maximum_indices) \
+_(attr, maxnorm) \
+_(attr, mean) \
+_(attr, median) \
+_(attr, memory_format) \
+_(attr, meta) \
+_(attr, min) \
+_(attr, min_indices) \
+_(attr, min_seqlen) \
+_(attr, min_val) \
+_(attr, minlength) \
+_(attr, mode) \
+_(attr, momentum) \
+_(attr, momentum_buffer_list) \
+_(attr, n) \
+_(attr, n_bins) \
+_(attr, n_fft) \
+_(attr, names) \
+_(attr, nan) \
+_(attr, need_weights) \
+_(attr, neg_log_likelihood) \
+_(attr, negative) \
+_(attr, negative_slope) \
+_(attr, neginf) \
+_(attr, nested_size) \
+_(attr, nested_strides) \
+_(attr, nesterov) \
+_(attr, new_data) \
+_(attr, nnz) \
+_(attr, noise) \
+_(attr, non_blocking) \
+_(attr, norm) \
+_(attr, norm_bias_1) \
+_(attr, norm_bias_2) \
+_(attr, norm_first) \
+_(attr, norm_type) \
+_(attr, norm_weight_1) \
+_(attr, norm_weight_2) \
+_(attr, normalization) \
+_(attr, normalized) \
+_(attr, normalized_shape) \
+_(attr, normalized_shape_ndim) \
+_(attr, nt_example) \
+_(attr, num_chunks) \
+_(attr, num_classes) \
+_(attr, num_generated) \
+_(attr, num_groups) \
+_(attr, num_head) \
+_(attr, num_heads) \
+_(attr, num_layers) \
+_(attr, num_parallel) \
+_(attr, num_samples) \
+_(attr, num_splits_key) \
+_(attr, num_weights) \
+_(attr, numel) \
+_(attr, observer_on) \
+_(attr, offs) \
+_(attr, offset) \
+_(attr, offset2bag) \
+_(attr, offsets) \
+_(attr, onesided) \
+_(attr, ord) \
+_(attr, order) \
+_(attr, other) \
+_(attr, out) \
+_(attr, out0) \
+_(attr, out1) \
+_(attr, out2) \
+_(attr, out3) \
+_(attr, out4) \
+_(attr, out5) \
+_(attr, out6) \
+_(attr, out_channel) \
+_(attr, out_dim) \
+_(attr, out_dtype) \
+_(attr, out_features) \
+_(attr, out_int32) \
+_(attr, outdim) \
+_(attr, output) \
+_(attr, output_mask) \
+_(attr, output_padding) \
+_(attr, output_scale) \
+_(attr, output_size) \
+_(attr, output_zero_point) \
+_(attr, p) \
+_(attr, packed) \
+_(attr, packed_hh) \
+_(attr, packed_ih) \
+_(attr, packed_weight) \
+_(attr, packed_weights) \
+_(attr, pad) \
+_(attr, pad_mode) \
+_(attr, padded) \
+_(attr, padding) \
+_(attr, padding_idx) \
+_(attr, padding_mode) \
+_(attr, padding_side) \
+_(attr, padding_value) \
+_(attr, params) \
+_(attr, path) \
+_(attr, pdist) \
+_(attr, per_row_fake_quant) \
+_(attr, per_sample_weights) \
+_(attr, periodic) \
+_(attr, philox_offset) \
+_(attr, philox_seed) \
+_(attr, physical_layout) \
+_(attr, pin_memory) \
+_(attr, pivot) \
+_(attr, pivots) \
+_(attr, plain_idx) \
+_(attr, plain_indices) \
+_(attr, pos_weight) \
+_(attr, posinf) \
+_(attr, positive) \
+_(attr, pow) \
+_(attr, prepend) \
+_(attr, primal) \
+_(attr, prob) \
+_(attr, proj_bias) \
+_(attr, proj_size) \
+_(attr, proj_weight) \
+_(attr, q) \
+_(attr, qGroupSize) \
+_(attr, qScale) \
+_(attr, qScaleAndZeros) \
+_(attr, qZeros) \
+_(attr, qkv) \
+_(attr, qkv_bias) \
+_(attr, qkv_weight) \
+_(attr, qtensor) \
+_(attr, quant_max) \
+_(attr, quant_min) \
+_(attr, quasi) \
+_(attr, query) \
+_(attr, r) \
+_(attr, ragged_idx) \
+_(attr, random_samples) \
+_(attr, range) \
+_(attr, rank) \
+_(attr, ratio) \
+_(attr, rcond) \
+_(attr, real) \
+_(attr, reduce) \
+_(attr, reduce_range) \
+_(attr, reduction) \
+_(attr, repeats) \
+_(attr, replacement) \
+_(attr, requires_grad) \
+_(attr, reserve) \
+_(attr, reserveSpace) \
+_(attr, reservedSpace) \
+_(attr, residuals) \
+_(attr, result) \
+_(attr, retain_graph) \
+_(attr, return_complex) \
+_(attr, return_counts) \
+_(attr, return_debug_mask) \
+_(attr, return_inverse) \
+_(attr, reverse) \
+_(attr, right) \
+_(attr, rng_state) \
+_(attr, rounding_mode) \
+_(attr, row) \
+_(attr, row_indices) \
+_(attr, rstd) \
+_(attr, rtol) \
+_(attr, running_max) \
+_(attr, running_mean) \
+_(attr, running_min) \
+_(attr, running_var) \
+_(attr, s) \
+_(attr, save_invstd) \
+_(attr, save_mean) \
+_(attr, save_var) \
+_(attr, save_var_transform) \
+_(attr, saved_g) \
+_(attr, saved_norms) \
+_(attr, saved_v) \
+_(attr, scalar) \
+_(attr, scalar1) \
+_(attr, scalar2) \
+_(attr, scalars) \
+_(attr, scale) \
+_(attr, scale_a) \
+_(attr, scale_b) \
+_(attr, scale_backoff_factor) \
+_(attr, scale_factors) \
+_(attr, scale_grad_by_freq) \
+_(attr, scale_growth_factor) \
+_(attr, scale_hh) \
+_(attr, scale_ih) \
+_(attr, scale_result) \
+_(attr, scales) \
+_(attr, scales_d) \
+_(attr, scales_h) \
+_(attr, scales_w) \
+_(attr, scales_zeros) \
+_(attr, sections) \
+_(attr, seed) \
+_(attr, self) \
+_(attr, self_is_result) \
+_(attr, self_num_batch_dims) \
+_(attr, self_or_result) \
+_(attr, self_sizes) \
+_(attr, seqlen_k) \
+_(attr, sequences) \
+_(attr, seqused_k) \
+_(attr, shape) \
+_(attr, shared) \
+_(attr, shared_storage_dqdkdv) \
+_(attr, shifts) \
+_(attr, side) \
+_(attr, sigma) \
+_(attr, sign) \
+_(attr, singular_values) \
+_(attr, size) \
+_(attr, sizes) \
+_(attr, skip_first) \
+_(attr, sobolstate) \
+_(attr, solution) \
+_(attr, some) \
+_(attr, sorted) \
+_(attr, sorted_sequence) \
+_(attr, sorter) \
+_(attr, source) \
+_(attr, spacing) \
+_(attr, sparse) \
+_(attr, sparse_dim) \
+_(attr, sparse_grad) \
+_(attr, split_k) \
+_(attr, split_k_mode) \
+_(attr, split_size) \
+_(attr, split_sizes) \
+_(attr, src) \
+_(attr, stable) \
+_(attr, start) \
+_(attr, start_dim) \
+_(attr, state_steps) \
+_(attr, state_sums) \
+_(attr, std) \
+_(attr, step) \
+_(attr, steps) \
+_(attr, storage_offset) \
+_(attr, stride) \
+_(attr, sum_S) \
+_(attr, sum_dy) \
+_(attr, sum_dy_xmu) \
+_(attr, sumdim) \
+_(attr, swap) \
+_(attr, symmetric_quant) \
+_(attr, t) \
+_(attr, tangent) \
+_(attr, target) \
+_(attr, target_lengths) \
+_(attr, targets) \
+_(attr, tau) \
+_(attr, tensor) \
+_(attr, tensor1) \
+_(attr, tensor2) \
+_(attr, tensor_indices_or_sections) \
+_(attr, tensors) \
+_(attr, tensors1) \
+_(attr, test_element) \
+_(attr, test_elements) \
+_(attr, the_template) \
+_(attr, theta) \
+_(attr, thread_masks) \
+_(attr, threshold) \
+_(attr, to) \
+_(attr, tol) \
+_(attr, total) \
+_(attr, total_L) \
+_(attr, total_length) \
+_(attr, total_weight) \
+_(attr, train) \
+_(attr, training) \
+_(attr, transpose) \
+_(attr, transpose_result) \
+_(attr, transposed) \
+_(attr, type1) \
+_(attr, type2) \
+_(attr, unbiased) \
+_(attr, unitriangular) \
+_(attr, unpack_data) \
+_(attr, unpack_pivots) \
+_(attr, unroll_dim) \
+_(attr, unsafe) \
+_(attr, unused) \
+_(attr, update) \
+_(attr, upper) \
+_(attr, upscale_factor) \
+_(attr, use_cutlass) \
+_(attr, use_fast_accum) \
+_(attr, use_gelu) \
+_(attr, use_input_stats) \
+_(attr, v) \
+_(attr, value) \
+_(attr, values) \
+_(attr, var) \
+_(attr, vec) \
+_(attr, vec1) \
+_(attr, vec2) \
+_(attr, w_hh) \
+_(attr, w_ih) \
+_(attr, weight) \
+_(attr, weight0) \
+_(attr, weight1) \
+_(attr, weight2) \
+_(attr, weight3) \
+_(attr, weight4) \
+_(attr, weight_arr) \
+_(attr, weight_buf) \
+_(attr, weight_decay) \
+_(attr, weight_g) \
+_(attr, weight_scale) \
+_(attr, weight_stride0) \
+_(attr, weight_zero_point) \
+_(attr, weights) \
+_(attr, win_length) \
+_(attr, window) \
+_(attr, window_length) \
+_(attr, window_size) \
+_(attr, window_size_left) \
+_(attr, window_size_right) \
+_(attr, with_replacement) \
+_(attr, workspace) \
+_(attr, wrap) \
+_(attr, x) \
+_(attr, x1) \
+_(attr, x2) \
+_(attr, y) \
+_(attr, z) \
+_(attr, z_state) \
+_(attr, zero_infinity) \
+_(attr, zero_point) \
+_(attr, zero_point_hh) \
+_(attr, zero_point_ih) \
+_(attr, zero_points)

.venv/lib/python3.12/site-packages/torch/include/ATen/core/blob.h ADDED Viewed

	@@ -0,0 +1,204 @@

+#pragma once
+#include <type_traits>
+#include <c10/util/intrusive_ptr.h>
+#include <c10/util/typeid.h>
+#include <c10/macros/Macros.h>
+namespace caffe2 {
+class Tensor;
+/**
+ * @brief Blob is a general container that hosts a typed pointer.
+ *
+ * A Blob hosts a pointer as well as its type, and takes charge of deleting it
+ * properly when the blob is deallocated or re-allocated with a new type. A blob
+ * could contain anything, although the most common case is to contain a Tensor.
+ */
+class TORCH_API Blob final : public c10::intrusive_ptr_target {
+ public:
+  /**
+   * Initializes an empty Blob.
+   *
+   * The constructor is defaulted, so each data member takes whatever initial
+   * value its own declaration gives it (those declarations are outside this
+   * excerpt).
+   */
+  Blob() noexcept = default;
+  ~Blob() override { // overrides the destructor inherited from c10::intrusive_ptr_target
+    Reset(); // no-argument overload, declared outside this excerpt; NOTE(review): presumably releases the held object -- confirm
+  }
+  // Move construction: start out as a default-constructed Blob (delegating
+  // constructor), then hand `other` to swap(), which is declared outside this
+  // excerpt.
+  Blob(Blob&& other) noexcept : Blob() {
+    this->swap(other);
+  }
+  // Move assignment: move-construct a scratch Blob from `other`, then swap the
+  // scratch object with *this. The scratch object is destroyed on return;
+  // assuming swap() exchanges the two blobs' state, it takes the previous
+  // contents of *this with it.
+  Blob& operator=(Blob&& other) noexcept {
+    Blob scratch(std::move(other));
+    scratch.swap(*this);
+    return *this;
+  }
+  /**
+   * Checks if the content stored in the blob is of type T.
+   *
+   * The decision is made by asking the recorded TypeMeta (meta_.Match<T>());
+   * the stored pointer itself is not inspected.
+   *
+   * @tparam T the type to test for.
+   * @return the result of meta_.Match<T>().
+   */
+  template <class T>
+  bool IsType() const noexcept {
+    return meta_.Match<T>();
+  }
+  /**
+   * Returns the meta info of the blob.
+   *
+   * The TypeMeta is handed back by value (a copy of meta_), not by reference.
+   */
+  const TypeMeta meta() const noexcept {
+    return meta_;
+  }
+  /**
+   * Returns a printable typename of the blob.
+   *
+   * The view is whatever meta_.name() yields. NOTE(review): the lifetime of
+   * the viewed characters is governed by TypeMeta -- confirm before storing
+   * the result.
+   */
+  std::string_view TypeName() const noexcept {
+    return meta_.name();
+  }
+  /**
+   * @brief Returns a const reference to the stored object, viewed as a T.
+   *
+   * TORCH_INTERNAL_ASSERT verifies that the blob's recorded type matches T
+   * before the stored pointer is cast; the failure message names both the
+   * recorded type and the requested one.
+   */
+  // TODO(jerryzh): add a Get(c10::DeviceType) function?
+  template <class T>
+  const T& Get() const {
+    TORCH_INTERNAL_ASSERT(
+        IsType<T>(),
+        "wrong type for the Blob instance. Blob contains ",
+        meta_.name(),
+        " while caller expects ",
+        TypeMeta::TypeName<T>());
+    // TODO: after we add Get<Tensor>(c10::DeviceType)
+    // and changed all the callsites, we can add
+    // a static assert here to enforce T != Tensor
+    const T* const typed = static_cast<const T*>(pointer_);
+    return *typed;
+  }
+  const void* GetRaw() const noexcept { // read-only, untyped access to the stored pointer; no type check is made
+    return pointer_;
+  }
+  void* GetRaw() noexcept { // mutable, untyped access to the stored pointer; no type check is made
+    return pointer_;
+  }
+  /**
+   * @brief Returns a mutable pointer to the stored object, viewed as a T.
+   *
+   * When the blob does not currently hold a T, a default-constructed T is
+   * allocated and passed to Reset(), whose result is returned. T therefore
+   * has to be default constructible (enforced at compile time); for other
+   * types, build the object yourself and call Reset() with it.
+   */
+  template <class T>
+  T* GetMutable() {
+    static_assert(
+        std::is_default_constructible_v<T>,
+        "GetMutable can't be called with non-default-constructible types. "
+        "Try using specialized methods");
+    if (!IsType<T>()) {
+      // TODO Re-enable logging
+      // VLOG(1) << "Create new mutable object " << TypeMeta::TypeName<T>();
+      return Reset<T>(new T());
+    }
+    // Type already matches: reuse the stored object as-is.
+    return static_cast<T*>(pointer_);
+  }
+  template <class T>
+  T* GetMutableOrNull() {
+    if (IsType<T>()) {
+      return static_cast<T*>(pointer_);
+    } else {
+      return nullptr;
+    }
+  }
+  /**
+   * Sets the underlying object to the allocated one. The Blob then takes over
+   * the ownership of the passed in pointer. If there is already an object in
+   * the Blob, the old object is freed.
+   *
+   * This is used when the underlying class T does not have a default ctor, or
+   * complex initializations needs to be done outside the blob.
+   */
+  template <class T>
+  T* Reset(T* allocated) {
+    free_();
+    meta_ = TypeMeta::Make<T>();
+    pointer_ = static_cast<void*>(allocated);
+    has_ownership_ = true;
+    return allocated;
+  }
+  /**
+   * Sets the underlying object to the allocated one, but does not take over
+   * the ownership of the passed in pointer. If there is already an object in
+   * the Blob, the old object is freed.
+   *
+   * Unlike Reset, this does not take over the ownership of the pointer and the
+   * caller is responsible for making sure that the lifetime of the allocated
+   * blob outlasts the lifetime of any access to this blob, until another Reset
+   * call is made or the blob is destructed.
+   */
+  template <class T>
+  std::remove_const_t<T>* ShareExternal(
+      std::remove_const_t<T>* allocated) {
+    return static_cast<T*>(ShareExternal(
+        static_cast<void*>(allocated),
+        TypeMeta::Make<std::remove_const_t<T>>()));
+  }
+  void* ShareExternal(void* allocated, const TypeMeta meta) {
+    free_();
+    meta_ = meta;
+    pointer_ = allocated;
+    has_ownership_ = false;
+    return allocated;
+  }
+  /**
+   * Resets the Blob to an empty one.
+   */
+  void Reset() {
+    free_();
+    pointer_ = nullptr;
+    meta_ = TypeMeta();
+    has_ownership_ = false;
+  }
+  /**
+   * @brief Swaps the underlying storage of two blobs.
+   */
+  void swap(Blob& rhs)  noexcept {
+    using std::swap;
+    swap(meta_, rhs.meta_);
+    swap(pointer_, rhs.pointer_);
+    swap(has_ownership_, rhs.has_ownership_);
+  }
+ private:
+  void free_() {
+    if (has_ownership_ && pointer_ != nullptr) {
+      (*meta_.deleteFn())(pointer_);
+    }
+  }
+  TypeMeta meta_;
+  void* pointer_{nullptr};
+  bool has_ownership_{false};
+  C10_DISABLE_COPY_AND_ASSIGN(Blob);
+};
+/**
+ * Non-member swap for Blob so that unqualified `swap(a, b)` calls exchange the
+ * contents of the two blobs via Blob::swap.
+ */
+inline void swap(Blob& lhs, Blob& rhs) noexcept {
+  // Blob::swap exchanges every member pairwise, so it is symmetric.
+  rhs.swap(lhs);
+}
+/**
+ * Streams a short description of the blob in the form `Blob[<type name>]`.
+ */
+inline std::ostream& operator<<(std::ostream& out, const Blob& v) {
+  out << "Blob[";
+  out << v.TypeName();
+  out << "]";
+  return out;
+}
+} // namespace caffe2

.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel.h ADDED Viewed

	@@ -0,0 +1,213 @@

+#pragma once
+#include <ATen/core/boxing/OperatorKernel.h>
+#include <c10/core/DispatchKeySet.h>
+#include <c10/util/intrusive_ptr.h>
+namespace c10 {
+struct IValue;
+using Stack = std::vector<IValue>;
+class OperatorHandle;
+class KernelFunction;
+// This kernel implements the behavior of falling through to the next available
+// registered dispatch key.  The implementation of this function is FAST; there
+// is no overhead in falling through to the next key.  See the cpp file for more
+// implementation notes; notably, this does NOT actually go through the
+// boxing/unboxing codepath.
+TORCH_API void fallthrough_kernel(
+    OperatorKernel*,
+    const OperatorHandle&,
+    DispatchKeySet,
+    Stack*);
+// Note [Ambiguity in AutogradOther kernel]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This error-reporting kernel is registered to the AutogradOther entry in the
+// dispatch table when there is both a CompositeImplicitAutograd kernel and a
+// backend kernel for ANY backend that maps to AutogradOther.  To see why
+// this is necessary in the AutogradOther case, it's helpful to first see
+// why everything works out fine for a backend that has a reserved Autograd
+// entry (see rule 2.2 in [Note] DispatchTable computation):
+//
+//    CPU   AutogradCPU
+//    reg?  registers with...
+//    -------------------------------------------------
+//    y     Autograd registration takes precedence
+//          over CompositeImplicitAutograd.
+//          This is good, because the CPU specific backend
+//          implementation is more specialized and typically better;
+//          if we used the composite, we would bypass it.
+//          (NB: the Autograd key is guaranteed to exist because
+//          the autograd codegen requires it!)
+//
+//    n     CompositeImplicitAutograd takes precedence.
+//          This is also good, because the Autograd
+//          registration (if it exists) would try to redispatch
+//          to the (non-existent) CPU implementation; by
+//          using the composite, we ensure the operator
+//          actually works.
+//
+// As you can see, when we have a specific Autograd key (AutogradCPU), we can
+// decide whether or not to use the CompositeImplicitAutograd kernel or the
+// Autograd kernel based on whether or not the backend kernel exists.
+//
+// However, for AutogradOther (which is the catchall autograd kernel for
+// everything that doesn't have a specific Autograd key), we can't do this
+// trick because there isn't any unique backend to peek at to disambiguate:
+// backends that have implementations would prefer the Autograd kernel, while
+// unimplemented backends would prefer CompositeImplicitAutograd.  Rather
+// than arbitrarily pick one or the other, we just register a kernel that raises
+// an error and let the user decide how to proceed.
+TORCH_API void ambiguous_autogradother_kernel(
+    OperatorKernel*,
+    const OperatorHandle&,
+    DispatchKeySet,
+    Stack*);
+// Note [named_not_supported_kernel]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// This kernel reports an error message saying that named tensors are not
+// supported.  This kernel doesn't rely on the Stack, and so it is special
+// cased in the dispatcher to be triggered before we attempt boxing (so we can
+// give a good error message in cases when boxing is not supported).  When
+// boxing is universally supported this can be removed.
+[[noreturn]] TORCH_API void named_not_supported_kernel(
+    OperatorKernel*,
+    const OperatorHandle&,
+    DispatchKeySet,
+    Stack*);
+/**
+ * BoxedKernel is similar to a std::function storing a boxed kernel.
+ *
+ * It pairs an optional OperatorKernel functor (functor_) with a function
+ * pointer of type InternalBoxedKernelFunction (boxed_kernel_func_).
+ */
+class TORCH_API BoxedKernel final {
+ public:
+  // This is how boxed kernels are actually stored
+  //
+  // Note [Plumbing Keys Through The Dispatcher]
+  // Benchmarks have shown that it is expensive for the dispatcher to read from
+  // thread-local storage (TLS) upon every dispatch call in order to compute
+  // which kernel to dispatch to.
+  //
+  // To mitigate this, we've updated the calling convention inside the
+  // dispatcher to expect every kernel that it stores to have a first argument
+  // of type DispatchKeySet.
+  //
+  // What are the invariants of the DispatchKeySet when it gets passed to a
+  // kernel?
+  // - All keys to the left of the current dispatch key have been masked out.
+  //   (e.g. a Tracing kernel that takes in the DispatchKeySet will expect the
+  //   highest bit to be DispatchKey::Tracer)
+  // - All other keys that the dispatcher normally would have computed through
+  //   TLS + global state + op arguments
+  //   are still in the set.
+  //
+  // Kernels can then opt into using this keyset to save the dispatcher from
+  // doing repeated work during redispatches: recalculating the highest-priority
+  // dispatch key, which involves reading from TLS. Instead, the kernels that
+  // opt in will calculate an updated DispatchKeySet directly from the old one,
+  // and pass the updated set directly into the dispatcher upon redispatching.
+  //
+  // This is an opt-in mechanism: Kernels can automatically opt in by setting
+  // the first argument in their signature to be of type DispatchKeySet. See the
+  // kernels in VariableTypeEverything.cpp and TraceTypeEverything.cpp for
+  // examples.
+  //
+  // The mechanism for optionally passing that DispatchKeySet into the kernel
+  // lives in make_boxed_from_unboxed_functor.h. See Note [Plumbing Keys Through
+  // The Dispatcher 2] for details.
+  using InternalBoxedKernelFunction =
+      void(OperatorKernel*, const OperatorHandle&, DispatchKeySet, Stack*);
+  // This is the public API for how boxed kernels are defined
+  using BoxedKernelFunction = void(const OperatorHandle&, Stack*);
+  using BoxedKernelFunction_withDispatchKeys =
+      void(const OperatorHandle&, DispatchKeySet, Stack*);
+  BoxedKernel();
+  // Fast path for dispatch to allow not touching the boxed kernel in
+  // the common case where unboxed is available.
+  bool isValid() const;
+  bool isFallthrough() const;
+  /**
+   * Call the function with boxed arguments.
+   *
+   * The stored functor (possibly null), `opHandle`, `dispatchKeySet` and
+   * `stack` are forwarded to the stored kernel function pointer.
+   */
+  void callBoxed(
+      const OperatorHandle& opHandle,
+      DispatchKeySet dispatchKeySet,
+      Stack* stack) const;
+  /**
+   * Create a BoxedKernel from a boxed function.
+   *
+   * Example:
+   *
+   * > void boxed_func(const OperatorHandle&, Stack* stack) {...}
+   * > BoxedKernel func = BoxedKernel::makeFromFunction<&boxed_func>();
+   */
+  template <BoxedKernelFunction* func>
+  static BoxedKernel makeFromFunction();
+  /**
+   * Same as above, for boxed functions that also take a DispatchKeySet.
+   *
+   * TODO: This will only be useful if we write a backend fallback that plumbs
+   * dispatch keys (currently there are none). See Note [Plumbing Keys Through
+   * The Dispatcher] for details.
+   */
+  template <BoxedKernelFunction_withDispatchKeys* func>
+  static BoxedKernel makeFromFunction();
+  /**
+   * Create a BoxedKernel from a boxed functor.
+   *
+   * Example:
+   *
+   * > class MyFunctor final : public c10::OperatorKernel {
+   * >   public:
+   * >     void operator()(const OperatorHandle&, DispatchKeySet, Stack*) {...}
+   * > };
+   * > BoxedKernel func =
+   * >     BoxedKernel::makeFromFunctor(std::make_unique<MyFunctor>());
+   */
+  template <class KernelFunctor>
+  static BoxedKernel makeFromFunctor(
+      std::unique_ptr<KernelFunctor> kernelFunctor);
+  static BoxedKernel makeFallthrough();
+  static BoxedKernel makeAmbiguousAutogradOther();
+  static BoxedKernel makeNamedNotSupported();
+ private:
+  friend class KernelFunction;
+  template <BoxedKernelFunction* func>
+  static void make_boxed_function(
+      OperatorKernel*,
+      const OperatorHandle& opHandle,
+      DispatchKeySet,
+      Stack* stack);
+  template <BoxedKernelFunction_withDispatchKeys* func>
+  static void make_boxed_function(
+      OperatorKernel*,
+      const OperatorHandle& opHandle,
+      DispatchKeySet,
+      Stack* stack);
+  explicit BoxedKernel(
+      std::unique_ptr<OperatorKernel> functor,
+      InternalBoxedKernelFunction* boxed_kernel_func);
+  OperatorKernel* getFunctor() const;
+  InternalBoxedKernelFunction* getFnPtr() const;
+  c10::intrusive_ptr<OperatorKernel> functor_;
+  InternalBoxedKernelFunction* boxed_kernel_func_;
+};
+} // namespace c10
+#include <ATen/core/boxing/BoxedKernel_impl.h>

.venv/lib/python3.12/site-packages/torch/include/ATen/core/boxing/BoxedKernel_impl.h ADDED Viewed

	@@ -0,0 +1,106 @@

+#pragma once
+namespace c10 {
+inline BoxedKernel::BoxedKernel() : functor_(), boxed_kernel_func_(nullptr) {}
+inline BoxedKernel::BoxedKernel(
+    std::unique_ptr<OperatorKernel> functor,
+    InternalBoxedKernelFunction* boxed_kernel_func)
+    : functor_(std::move(functor)), boxed_kernel_func_(boxed_kernel_func) {}
+// Adapter giving a plain boxed function (one without a DispatchKeySet
+// parameter) the internal boxed-kernel signature.
+template <BoxedKernel::BoxedKernelFunction* func>
+inline void BoxedKernel::make_boxed_function(
+    OperatorKernel* /*functor*/,
+    const OperatorHandle& opHandle,
+    DispatchKeySet /*dispatchKeySet*/,
+    Stack* stack) {
+  // The DispatchKeySet is deliberately discarded: `func` cannot receive it.
+  // See Note [Plumbing Keys Through The Dispatcher 2] for details.
+  (*func)(opHandle, stack);
+}
+// Adapter giving a boxed function that accepts a DispatchKeySet the internal
+// boxed-kernel signature; only the functor argument is dropped.
+template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
+inline void BoxedKernel::make_boxed_function(
+    OperatorKernel* /*functor*/,
+    const OperatorHandle& opHandle,
+    DispatchKeySet ks,
+    Stack* stack) {
+  // The key set is passed straight through to `func`.
+  // See Note [Plumbing Keys Through The Dispatcher 2] for details.
+  (*func)(opHandle, ks, stack);
+}
+// A BoxedKernel is valid once it holds a kernel function pointer.
+inline bool BoxedKernel::isValid() const {
+  const bool has_kernel_fn = (nullptr != boxed_kernel_func_);
+  return has_kernel_fn;
+}
+// True when the stored kernel function is exactly fallthrough_kernel.
+inline bool BoxedKernel::isFallthrough() const {
+  const bool is_fallthrough = (&fallthrough_kernel == boxed_kernel_func_);
+  return is_fallthrough;
+}
+// Invokes the stored kernel function with the stored functor (possibly null)
+// and the given boxed arguments. The kernel must be initialized; this is only
+// checked by a debug-only assertion.
+inline void BoxedKernel::callBoxed(
+    const OperatorHandle& opHandle,
+    DispatchKeySet dispatchKeySet,
+    Stack* stack) const {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      boxed_kernel_func_ != nullptr,
+      "Tried to call BoxedKernel::callBoxed() on an uninitialized BoxedKernel.");
+  OperatorKernel* const functor = functor_.get();
+  boxed_kernel_func_(functor, opHandle, dispatchKeySet, stack);
+}
+// Wraps a plain boxed function. A free function carries no state, so the
+// resulting kernel has no functor_ object.
+template <BoxedKernel::BoxedKernelFunction* func>
+inline BoxedKernel BoxedKernel::makeFromFunction() {
+  InternalBoxedKernelFunction* const adapter = &make_boxed_function<func>;
+  return BoxedKernel(nullptr, adapter);
+}
+// Wraps a boxed function that also takes a DispatchKeySet. As with the plain
+// variant, the resulting kernel has no functor_ object.
+template <BoxedKernel::BoxedKernelFunction_withDispatchKeys* func>
+inline BoxedKernel BoxedKernel::makeFromFunction() {
+  InternalBoxedKernelFunction* const adapter = &make_boxed_function<func>;
+  return BoxedKernel(nullptr, adapter);
+}
+// Creates the kernel whose function pointer is fallthrough_kernel; it has no
+// functor_ object.
+inline BoxedKernel BoxedKernel::makeFallthrough() {
+  InternalBoxedKernelFunction* const kernel_fn = &fallthrough_kernel;
+  return BoxedKernel(nullptr, kernel_fn);
+}
+// Creates the kernel whose function pointer is ambiguous_autogradother_kernel;
+// it has no functor_ object.
+inline BoxedKernel BoxedKernel::makeAmbiguousAutogradOther() {
+  InternalBoxedKernelFunction* const kernel_fn =
+      &ambiguous_autogradother_kernel;
+  return BoxedKernel(nullptr, kernel_fn);
+}
+// Creates the kernel whose function pointer is named_not_supported_kernel; it
+// has no functor_ object.
+inline BoxedKernel BoxedKernel::makeNamedNotSupported() {
+  InternalBoxedKernelFunction* const kernel_fn = &named_not_supported_kernel;
+  return BoxedKernel(nullptr, kernel_fn);
+}
+// Wraps a functor object. The BoxedKernel takes over `kernelFunctor`, and the
+// stored function pointer casts the type-erased OperatorKernel* back to
+// KernelFunctor before calling its operator().
+template <class KernelFunctor>
+inline BoxedKernel BoxedKernel::makeFromFunctor(
+    std::unique_ptr<KernelFunctor> kernelFunctor) {
+  static_assert(
+      std::is_base_of_v<OperatorKernel, KernelFunctor>,
+      "Tried to call BoxedKernel::makeFromFunctor<KernelFunctor>, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it.");
+  // A capture-less lambda converts to the plain function pointer we store.
+  InternalBoxedKernelFunction* const invoke_functor =
+      [](OperatorKernel* erased,
+         const OperatorHandle& opHandle,
+         DispatchKeySet keys,
+         Stack* boxedArgs) {
+        auto* typed = static_cast<KernelFunctor*>(erased);
+        (*typed)(opHandle, keys, boxedArgs);
+      };
+  return BoxedKernel(std::move(kernelFunctor), invoke_functor);
+}
+// Returns the raw pointer held by functor_ (the BoxedKernel keeps its own
+// reference to it).
+inline OperatorKernel* BoxedKernel::getFunctor() const {
+  OperatorKernel* const raw_functor = functor_.get();
+  return raw_functor;
+}
+// Returns the stored kernel function pointer (null for an uninitialized
+// BoxedKernel).
+inline BoxedKernel::InternalBoxedKernelFunction* BoxedKernel::getFnPtr() const {
+  InternalBoxedKernelFunction* const fn = boxed_kernel_func_;
+  return fn;
+}
+} // namespace c10