diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AmpKernels.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AmpKernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..c463c80e1c6dcfff66bde315d59bf7fcb73e9860
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/AmpKernels.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using _amp_foreach_non_finite_check_and_unscale_cpu__fn = void (*)(
+    TensorList,
+    Tensor&,
+    const Tensor&);
+
+using _amp_update_scale_cpu__fn = Tensor& (*)(
+    Tensor&,
+    Tensor&,
+    const Tensor&,
+    double,
+    double,
+    int64_t);
+
+DECLARE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu__fn, _amp_foreach_non_finite_check_and_unscale_cpu_stub);
+DECLARE_DISPATCH(_amp_update_scale_cpu__fn, _amp_update_scale_cpu_stub);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..90747c264b1564d072d8f57fdd38b2ef743ea62c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BucketizationUtils.h
@@ -0,0 +1,173 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/native/TypeProperties.h>
+#include <ATen/ScalarOps.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/result_type.h>
+#endif
+
+namespace at::native {
+
+// original values given by raw_*. If an original value is not contiguous, will make a contiguous copy to
+// the corresponding trimmed_* value. Additionally, if the dtypes of the boundary and input tensor do not
+// match, will change them to be a common super type so comparisons are done between the same types.
+// For any trimmed_* tensor, if its outgoing value matches what it was incoming (typically null), then the
+// corresponding raw_* version should be used since it was already contiguous of the right type.
+inline void searchsorted_maybe_trim_input_tensors(
+    Tensor& trimmed_input,
+    Tensor& trimmed_boundaries,
+    Tensor& trimmed_sorter,
+    const Tensor& raw_input,
+    const Tensor& raw_boundaries,
+    const Tensor& raw_sorter) {
+  bool in_is_contiguous = raw_input.is_contiguous();
+  bool bd_is_contiguous = raw_boundaries.is_contiguous();
+  bool sort_is_contiguous = raw_sorter.is_contiguous();
+
+  if (!in_is_contiguous) {
+    TORCH_WARN_ONCE("torch.searchsorted(): input value tensor is non-contiguous, this will lower the performance due "
+      "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous input value "
+      "tensor if possible. This message will only appear once per program.");
+    trimmed_input = raw_input.contiguous();
+  }
+  if (!bd_is_contiguous) {
+    TORCH_WARN_ONCE("torch.searchsorted(): boundary tensor is non-contiguous, this will lower the performance due "
+      "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous boundary "
+      "tensor if possible. This message will only appear once per program.");
+    trimmed_boundaries = raw_boundaries.contiguous();
+  }
+  if (!sort_is_contiguous) {
+    TORCH_WARN_ONCE("torch.searchsorted(): sorter tensor is non-contiguous, this will lower the performance due "
+      "to extra data copy when converting non-contiguous tensor to contiguous, please use contiguous sorter "
+      "tensor if possible. This message will only appear once per program.");
+    trimmed_sorter = raw_sorter.contiguous();
+  }
+  if (raw_input.dtype() != raw_boundaries.dtype()) {
+    at::native::ResultTypeState state = {};
+    state = at::native::update_result_type_state(raw_boundaries, state);
+    state = at::native::update_result_type_state(raw_input, state);
+    ScalarType common_stype = at::native::result_type(state);
+
+    TORCH_INTERNAL_ASSERT(common_stype != ScalarType::Undefined);
+    if (common_stype != raw_input.scalar_type()) {
+      trimmed_input = in_is_contiguous ? raw_input.to(common_stype) : trimmed_input.to(common_stype);
+    }
+    if (common_stype != raw_boundaries.scalar_type()) {
+      trimmed_boundaries = bd_is_contiguous ? raw_boundaries.to(common_stype) : trimmed_boundaries.to(common_stype);
+    }
+  }
+}
+
+/* unused but needed for internal jagged tensor class */
+inline void searchsorted_maybe_trim_input_tensors(
+    Tensor& trimmed_input,
+    Tensor& trimmed_boundaries,
+    const Tensor& raw_input,
+    const Tensor& raw_boundaries) {
+  Tensor trimmed_sorter;
+  Tensor raw_sorter;
+  return searchsorted_maybe_trim_input_tensors(
+      trimmed_input,
+      trimmed_boundaries,
+      trimmed_sorter,
+      raw_input,
+      raw_boundaries,
+      raw_sorter);
+}
+
+inline bool searchsorted_dims_matched_before_last_dim(const Tensor& boundaries, const Tensor& input) {
+  if (boundaries.dim() != input.dim()) {
+    return false;
+  }
+  const auto& dims_bd = boundaries.sizes();
+  const auto& dims_in = input.sizes();
+  for (int64_t dim = 0; dim + 1 < boundaries.dim(); ++dim) {
+    if (dims_bd[dim] != dims_in[dim]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline Tensor searchsorted_scalar_tensor(const Scalar& scalar, const c10::Device& device) {
+  auto tensor = c10::scalar_to_tensor(scalar, device);
+  // This is to adopt the scalar promotion rules defined in native/TypeProperties.h
+  // So we have the same type promotion rules as binary operations.
+  tensor.unsafeGetTensorImpl()->set_wrapped_number(true);
+  return tensor;
+}
+
+inline void searchsorted_pre_check(
+    const Tensor& boundaries,
+    const Tensor& input,
+    const Tensor& output,
+    const bool out_int32,
+    const bool right,
+    const std::optional<c10::string_view> side_opt,
+    const Tensor& sorter) {
+  if (side_opt) {
+    const c10::string_view side = *side_opt;
+    TORCH_CHECK(side == "left" || side == "right", "torch.searchsorted(): side can only be 'left' or 'right' but ",
+      "got ", side);
+
+    // assume the user has not explicitly set (right=False, side="right")
+    TORCH_CHECK(!right || side == "right", "torch.searchsorted(): side and right can't be set to opposites, got side "
+    "of ", side, " while right was True");
+  }
+
+  TORCH_CHECK(boundaries.device() == input.device(), "torch.searchsorted(): boundaries and input value tensors ",
+    "should have same device type, but got boundaries tensor device type ", boundaries.device(), " and input value ",
+    "tensor device type ", input.device());
+
+  if (sorter.defined()) {
+    TORCH_CHECK(sorter.device() == boundaries.device(), "torch.searchsorted(): sorter and boundary tensors should ",
+      "have same device type, but got sorter tensor device type ", sorter.device(), " and input value tensor ",
+      "device type ", boundaries.device());
+
+    TORCH_CHECK(sorter.sizes() == boundaries.sizes(), "torch.searchsorted(): boundary and sorter must have the same "
+      "size, but got boundary tensor ", boundaries.sizes(), "and got sorter tensor ", sorter.sizes());
+
+    TORCH_CHECK(sorter.scalar_type() == ScalarType::Long, "torch.searchsorted(): sorter must be a tensor of long ",
+      "dtype but got dtype ", sorter.scalar_type());
+
+    if (sorter.numel() > 0) {
+      auto minmax = sorter.aminmax();
+      int64_t vmin = std::get<0>(minmax).item().toLong();
+      int64_t vmax = std::get<1>(minmax).item().toLong();
+      TORCH_CHECK(vmin >= 0 && vmax < sorter.sizes().back(), "torch.searchsorted(): sorter index out of range");
+    }
+  }
+
+  TORCH_CHECK(input.dim() > 0 || (input.dim() == 0 && input.numel() == 1 && boundaries.dim() == 1),
+    "torch.searchsorted(): input value can be a scalar only when boundaries tensor dimension is 1, but we got ",
+    "boundaries tensor dim(", boundaries.dim(), ") and input value's dim(", input.dim(), ") numel(",
+    input.numel(), ")");
+
+  TORCH_CHECK(boundaries.dim() != 0, "torch.searchsorted(): boundaries tensor should have positive dimension, but ",
+    "got 0 dimension");
+
+  TORCH_CHECK(boundaries.dim() == 1 || searchsorted_dims_matched_before_last_dim(boundaries, input),
+    "torch.searchsorted(): boundaries tensor should be 1 dimension or the first N-1 dimensions of boundaries tensor ",
+    "and input value tensor must match, but we got boundaries tensor ", boundaries.sizes(), " and input value tensor ",
+    input.sizes());
+
+  ScalarType output_dtype = output.scalar_type();
+  TORCH_CHECK(
+      (output_dtype == ScalarType::Long && !out_int32) ||
+          (output_dtype == ScalarType::Int && out_int32),
+      "torch.searchsorted(): output tensor's dtype is wrong, it can only be Int(int32) or Long(int64) depending on ",
+      "whether out_int32 flag is True, but we got output tensor's dtype ", output_dtype,
+      " and out_int32 flag is ", (out_int32 ? "True" : "False"));
+
+  if (out_int32) {
+    TORCH_CHECK(boundaries.sizes().back() < INT_MAX,
+      "torch.searchsorted(): the size of boundaries' last dimension should be less than ", INT_MAX, ", but we got ",
+      boundaries.sizes().back());
+  }
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUBlas.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUBlas.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad209329c95eccf1a5203eae53132ffc7178322e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUBlas.h
@@ -0,0 +1,226 @@
+#pragma once
+
+#include <ATen/OpMathType.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/TransposeType.h>
+#include <c10/util/complex.h>
+#include <c10/core/ScalarType.h>
+#include <c10/core/Scalar.h>
+
+
+namespace at::native::cpublas {
+
+namespace internal {
+void normalize_last_dims(
+  TransposeType transa, TransposeType transb,
+  int64_t m, int64_t n, int64_t k,
+  int64_t *lda, int64_t *ldb, int64_t *ldc);
+}  // namespace internal
+
+using gemm_fn = void(*)(
+    at::ScalarType type,
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    const Scalar& alpha,
+    const void *a, int64_t lda,
+    const void *b, int64_t ldb,
+    const Scalar& beta,
+    void *c, int64_t ldc);
+
+DECLARE_DISPATCH(gemm_fn, gemm_stub);
+
+template <typename scalar_t>
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    at::opmath_type<scalar_t> alpha,
+    const scalar_t *a, int64_t lda,
+    const scalar_t *b, int64_t ldb,
+    at::opmath_type<scalar_t> beta,
+    scalar_t *c, int64_t ldc) {
+  internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc);
+  gemm_stub(
+    kCPU, c10::CppTypeToScalarType<scalar_t>::value,
+    transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+}
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    double alpha,
+    const double *a, int64_t lda,
+    const double *b, int64_t ldb,
+    double beta,
+    double *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    float alpha,
+    const float *a, int64_t lda,
+    const float *b, int64_t ldb,
+    float beta,
+    float *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    float alpha,
+    const at::BFloat16 *a, int64_t lda,
+    const at::BFloat16 *b, int64_t ldb,
+    float beta,
+    at::BFloat16 *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    const float alpha,
+    const at::BFloat16 *a, int64_t lda,
+    const at::BFloat16 *b, int64_t ldb,
+    const float beta,
+    float *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    float alpha,
+    const at::Half *a, int64_t lda,
+    const at::Half *b, int64_t ldb,
+    float beta,
+    at::Half *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    const float alpha,
+    const at::Half *a, int64_t lda,
+    const at::Half *b, int64_t ldb,
+    const float beta,
+    float *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    c10::complex<double> alpha,
+    const c10::complex<double> *a, int64_t lda,
+    const c10::complex<double> *b, int64_t ldb,
+    c10::complex<double> beta,
+    c10::complex<double> *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    c10::complex<float> alpha,
+    const c10::complex<float> *a, int64_t lda,
+    const c10::complex<float> *b, int64_t ldb,
+    c10::complex<float> beta,
+    c10::complex<float> *c, int64_t ldc);
+
+void gemm(
+    TransposeType transa, TransposeType transb,
+    int64_t m, int64_t n, int64_t k,
+    int64_t alpha,
+    const int64_t *a, int64_t lda,
+    const int64_t *b, int64_t ldb,
+    int64_t beta,
+    int64_t *c, int64_t ldc);
+
+template <typename scalar_t>
+void gemm_batched(
+    TransposeType transa, TransposeType transb,
+    int64_t batch_size, int64_t m, int64_t n, int64_t k,
+    scalar_t alpha,
+    const scalar_t * const *a, int64_t lda,
+    const scalar_t * const *b, int64_t ldb,
+    const scalar_t beta,
+    scalar_t * const *c, int64_t ldc);
+
+template <typename scalar_t>
+void gemm_batched_with_stride(
+    TransposeType transa, TransposeType transb,
+    int64_t batch_size, int64_t m, int64_t n, int64_t k,
+    scalar_t alpha,
+    const scalar_t *a, int64_t lda, int64_t batch_stride_a,
+    const scalar_t *b, int64_t ldb, int64_t batch_stride_b,
+    scalar_t beta,
+    scalar_t *c, int64_t ldc, int64_t batch_stride_c);
+
+using axpy_fn = void(*)(at::ScalarType type, int64_t n, const Scalar& a, const void *x, int64_t incx, void *y, int64_t incy);
+
+DECLARE_DISPATCH(axpy_fn, axpy_stub);
+
+template<typename scalar_t>
+void axpy(int64_t n, scalar_t a, const scalar_t *x, int64_t incx, scalar_t *y, int64_t incy){
+  if(n == 1)
+  {
+    incx = 1;
+    incy = 1;
+  }
+  axpy_stub(
+      kCPU, c10::CppTypeToScalarType<scalar_t>::value,
+      n, a, x, incx, y, incy);
+}
+
+void axpy(int64_t n, double a, const double *x, int64_t incx, double *y, int64_t incy);
+void axpy(int64_t n, float a, const float *x, int64_t incx, float *y, int64_t incy);
+void axpy(int64_t n, c10::complex<double> a, const c10::complex<double> *x, int64_t incx, c10::complex<double> *y, int64_t incy);
+void axpy(int64_t n, c10::complex<float> a, const c10::complex<float> *x, int64_t incx, c10::complex<float> *y, int64_t incy);
+
+using copy_fn = void(*)(at::ScalarType type, int64_t n, const void *x, int64_t incx, void *y, int64_t incy);
+
+DECLARE_DISPATCH(copy_fn, copy_stub);
+
+template<typename scalar_t>
+void copy(int64_t n, const scalar_t *x, int64_t incx, scalar_t *y, int64_t incy) {
+  if(n == 1)
+  {
+    incx = 1;
+    incy = 1;
+  }
+  copy_stub(
+      kCPU, c10::CppTypeToScalarType<scalar_t>::value,
+      n, x, incx, y, incy);
+}
+
+void copy(int64_t n, const double *x, int64_t incx, double *y, int64_t incy);
+void copy(int64_t n, const float *x, int64_t incx, float *y, int64_t incy);
+void copy(int64_t n, const c10::complex<double> *x, int64_t incx, c10::complex<double> *y, int64_t incy);
+void copy(int64_t n, const c10::complex<float> *x, int64_t incx, c10::complex<float> *y, int64_t incy);
+
+// Batch-reduce GEMM
+// Operates by the following formula:
+// C = alpha * SUM(A[i] x B[i]) + beta * C, i = 0 to batch size
+// A Base pointer to a tensor A.
+// B Base pointer to a tensor B.
+// C Pointer to a tensor C (accumulation buffer).
+TORCH_API void brgemm(
+    int64_t M,
+    int64_t N,
+    int64_t K,
+    int64_t ld_a,
+    int64_t ld_b,
+    int64_t ld_c,
+    const float alpha,
+    const float beta,
+    const at::Half* A,
+    const at::Half* B,
+    float* C);
+
+// Release brgemm hardware context
+void brgemm_release();
+
+// Pack B matrix to get better performance if needed
+void pack(
+    int64_t K,
+    int64_t N,
+    int64_t ld_in,
+    int64_t ld_out,
+    ScalarType dt_in,
+    ScalarType dt_out,
+    const void* in,
+    void* out);
+
+// Whether pack is needed in the platform.
+bool need_pack(ScalarType dt_in);
+
+} // namespace at::native::cpublas
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUFallback.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUFallback.h
new file mode 100644
index 0000000000000000000000000000000000000000..44cb534b8db2c9b063bef6b6e2c24f16db0ce454
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CPUFallback.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <ATen/core/ivalue.h>
+#include <ATen/core/stack.h>
+#include <ATen/core/boxing/KernelFunction.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <c10/util/Metaprogramming.h>
+#include <torch/library.h>
+
+namespace at::native {
+
+// This function implements a boxed fallback to CPU.
+// External backends can add their own custom logging on top if it to customize their own CPU fallbacks.
+TORCH_API void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool error_on_views = false,
+                            c10::DispatchKey cpu_dispatch_key = c10::DispatchKey::CPU);
+
+// This is a helper function that backends can use to directly call their boxed CPU fallback
+// TODO: update and add a usage example after https://github.com/pytorch/pytorch/pull/58092 lands.
+template<c10::KernelFunction::BoxedKernelFunction* fallback_fn, class Op, bool symint, class ReturnType, class... ParameterTypes>
+struct _call_fallback_fn final {};
+
+template<c10::KernelFunction::BoxedKernelFunction* fallback_fn, class Op, bool symint, class ReturnType, class... ParameterTypes>
+struct _call_fallback_fn<fallback_fn, Op, symint, ReturnType(ParameterTypes...)> final {
+    static ReturnType call(typename c10::maybe_keep_symint<symint, ParameterTypes>::type... args) {
+        auto op = c10::Dispatcher::singleton()
+            // TODO: figure out how to make compiler happy without dynamic casts
+            .findSchemaOrThrow((const char*) Op::name, (const char*) Op::overload_name)
+            //.findSchemaOrThrow("a", "b")
+            .typed<ReturnType (typename c10::maybe_keep_symint<symint, ParameterTypes>::type...)>();
+        return c10::impl::BoxedKernelWrapper<ReturnType (typename c10::maybe_keep_symint<symint, ParameterTypes>::type...)>::call(
+            c10::BoxedKernel::makeFromFunction<fallback_fn>(),
+            op,
+            c10::DispatchKeySet(), // we know that the cpu_fallback doesn't use the dispatch keyset.
+            // TODO: get std::forward<> to work
+            args...
+            );
+    }
+};
+
+template<c10::KernelFunction::BoxedKernelFunction* fallback_fn, class Op>
+using call_fallback_fn_symint = _call_fallback_fn<fallback_fn, Op, true, typename Op::schema>;
+
+template<c10::KernelFunction::BoxedKernelFunction* fallback_fn, class Op>
+using call_fallback_fn = _call_fallback_fn<fallback_fn, Op, false, typename Op::schema>;
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h
new file mode 100644
index 0000000000000000000000000000000000000000..db9742e04021e6fa6942c540c28f4ca6ff90d5df
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CanUse32BitIndexMath.h
@@ -0,0 +1,13 @@
+#pragma once
+#include <c10/macros/Export.h>
+#include <limits>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at::native {
+
+TORCH_API bool canUse32BitIndexMath(const at::TensorBase &t, int64_t max_elem=std::numeric_limits<int32_t>::max());
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..970b7da5cb70931ccb450a6ec24d511f975248c6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <ATen/native/CompositeRandomAccessorCommon.h>
+
+namespace at::native {
+
+struct TupleInfoCPU {
+  template <typename ...Types>
+  using tuple = std::tuple<Types...>;
+
+  template <typename ...Types>
+  static constexpr auto tie(Types&... args) noexcept {
+    return std::tie(args...);
+  }
+};
+
+template <typename KeyAccessor, typename ValueAccessor>
+using CompositeRandomAccessorCPU =
+  CompositeRandomAccessor<KeyAccessor, ValueAccessor, TupleInfoCPU>;
+
+template <typename Values, typename References>
+void swap(
+  references_holder<Values, References> rh1,
+  references_holder<Values, References> rh2
+) {
+  return std::swap(rh1.data(), rh2.data());
+}
+
+template <int N, typename Values, typename References>
+auto get(references_holder<Values, References> rh) -> decltype(std::get<N>(rh.data())) {
+  return std::get<N>(rh.data());
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h
new file mode 100644
index 0000000000000000000000000000000000000000..9111c3515afcefec2d81a261737ec28bcae00cdc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessorCommon.h
@@ -0,0 +1,263 @@
+#include <utility>
+
+#pragma once
+
+namespace at::native {
+
+namespace {
+
+// operator_brackets_proxy is used in
+// CompositeRandomAccessor in place of operator[].
+// For some iterators, references returned by operator[]
+// could become invalid, operator_brackets_proxy tries to
+// resolve that by making accessor[n] to be equivalent to
+// *(accessor + n).
+template <typename Accessor>
+class operator_brackets_proxy {
+  using reference = typename std::iterator_traits<Accessor>::reference;
+  using value_type = typename std::iterator_traits<Accessor>::value_type;
+
+public:
+  C10_HOST_DEVICE
+  operator_brackets_proxy(Accessor const& accessor)
+    : accessor(accessor)
+  {}
+
+  C10_HOST_DEVICE
+  operator reference() {
+    return *accessor;
+  }
+
+  C10_HOST_DEVICE
+  reference operator*() {
+    return *accessor;
+  }
+
+  C10_HOST_DEVICE
+  operator_brackets_proxy& operator=(value_type const& val) {
+    *accessor = val;
+    return *this;
+  }
+
+private:
+  Accessor accessor;
+};
+
+}
+
+// references_holder is used as a surrogate for the
+// references type from std::iterator_traits in CompositeRandomAccessor.
+// It is assumed in CompositeRandomAccessor that
+// References = tuple<Types&...>,
+// Values = tuple<Types...> by default,
+// but they could be anything as long as References could be
+// cast to Values.
+// If you plan to use it with STL, for example, you will need to
+// define 'swap` and `get`(aka std::get) methods.
+template <typename Values, typename References>
+class references_holder {
+public:
+  using values = Values;
+  using references = References;
+
+  C10_HOST_DEVICE
+  references_holder(references refs)
+    : refs{std::move(refs)}
+  {}
+
+  C10_HOST_DEVICE
+  operator references() {
+    return refs;
+  }
+
+  C10_HOST_DEVICE
+  operator values() {
+    return refs;
+  }
+
+  C10_HOST_DEVICE
+  references_holder& operator=(values vals) {
+    refs = vals;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  references& data() {
+    return refs;
+  }
+
+protected:
+  references refs;
+};
+
+// CompositeRandomAccessor is essentially a simplified version of
+// a random access iterator over two random access iterators.
+// TupleInfo should contain a variadic type `tuple`, and a method `tie`,
+// which constructs a tuple of references from a variadic list of arguments.
+template <typename KeyAccessor, typename ValueAccessor, typename TupleInfo>
+class CompositeRandomAccessor {
+  using self_type = CompositeRandomAccessor<KeyAccessor, ValueAccessor, TupleInfo>;
+
+  using key_accessor_value_type =
+    typename std::iterator_traits<KeyAccessor>::value_type;
+  using value_accessor_value_type =
+    typename std::iterator_traits<ValueAccessor>::value_type;
+  using key_accessor_reference_type =
+    typename std::iterator_traits<KeyAccessor>::reference;
+  using value_accessor_reference_type =
+    typename std::iterator_traits<ValueAccessor>::reference;
+
+  using composite_value_type = typename TupleInfo::template tuple<
+    key_accessor_value_type,
+    value_accessor_value_type>;
+  using composite_reference = typename TupleInfo::template tuple<
+    key_accessor_reference_type,
+    value_accessor_reference_type>;
+
+public:
+  using value_type = composite_value_type;
+  using reference = references_holder<composite_value_type, composite_reference>;
+  // Note that CompositeRandomAccessor does not hold key and values
+  // in a specific datastructure, which means that a pointer to a (key, value)
+  // is not defined. Hence we just use a pointer type of the KeyAccessor.
+  using pointer = typename std::iterator_traits<KeyAccessor>::pointer;
+  using difference_type = typename std::iterator_traits<KeyAccessor>::difference_type;
+  using iterator_category = std::random_access_iterator_tag;
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor() = default;
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor(KeyAccessor keys, ValueAccessor values)
+    : keys(keys), values(values)
+  {}
+
+  // Pointer-like operations {
+  C10_HOST_DEVICE
+  reference operator*() const {
+    return TupleInfo::tie(*keys, *values);
+  }
+
+  // operator->() is supposed to return a pointer type.
+  // Since CompositeRandomAccessor does not hold pointers to pairs,
+  // we just return a pointer to a key.
+  C10_HOST_DEVICE
+  auto* operator->() const {
+    return keys.operator->();
+  }
+
+  C10_HOST_DEVICE
+  reference operator[](difference_type idx) {
+    return operator_brackets_proxy<self_type>(
+      CompositeRandomAccessor(keys + idx, values + idx)
+    );
+  }
+  // }
+
+  // Prefix/postfix increment/decrement {
+  C10_HOST_DEVICE
+  CompositeRandomAccessor& operator++() {
+    ++keys;
+    ++values;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor operator++(int) {
+    CompositeRandomAccessor copy(*this);
+    ++*this;
+    return copy;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor& operator--() {
+    --keys;
+    --values;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor operator--(int) {
+    CompositeRandomAccessor copy(*this);
+    --*this;
+    return copy;
+  }
+  // }
+
+  // Arithmetic operations {
+  C10_HOST_DEVICE
+  CompositeRandomAccessor& operator+=(difference_type offset) {
+    keys += offset;
+    values += offset;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor operator+(difference_type offset) const {
+    return CompositeRandomAccessor(keys + offset, values + offset);
+  }
+
+  C10_HOST_DEVICE
+  friend CompositeRandomAccessor operator+(
+    difference_type offset,
+    const CompositeRandomAccessor& accessor
+  ) {
+    return accessor + offset;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor& operator-=(difference_type offset) {
+    keys -= offset;
+    values -= offset;
+    return *this;
+  }
+
+  C10_HOST_DEVICE
+  CompositeRandomAccessor operator-(difference_type offset) const {
+    return CompositeRandomAccessor(keys - offset, values - offset);
+  }
+
+  C10_HOST_DEVICE
+  difference_type operator-(const CompositeRandomAccessor& other) const {
+    return keys - other.keys;
+  }
+  // }
+
+  // Comparison operators {
+  C10_HOST_DEVICE
+  bool operator==(const CompositeRandomAccessor& other) const {
+    return keys == other.keys;
+  }
+
+  C10_HOST_DEVICE
+  bool operator!=(const CompositeRandomAccessor& other) const {
+    return keys != other.keys;
+  }
+
+  C10_HOST_DEVICE
+  bool operator<(const CompositeRandomAccessor& other) const {
+    return keys < other.keys;
+  }
+
+  C10_HOST_DEVICE
+  bool operator<=(const CompositeRandomAccessor& other) const {
+    return keys <= other.keys;
+  }
+
+  C10_HOST_DEVICE
+  bool operator>(const CompositeRandomAccessor& other) const {
+    return keys > other.keys;
+  }
+
+  C10_HOST_DEVICE
+  bool operator>=(const CompositeRandomAccessor& other) const {
+    return keys >= other.keys;
+  }
+  // }
+
+protected:
+  KeyAccessor keys;
+  ValueAccessor values;
+};
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad05a820f7b275096a3fe933e60bfd7538ef84bd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvUtils.h
@@ -0,0 +1,449 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/detail/CUDAHooksInterface.h>
+#include <ATen/native/DispatchStub.h>
+#include <c10/util/env.h>
+#include <c10/util/irange.h>
+
+#include <utility>
+
+namespace at::native {
+
+using conv_depthwise2d_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, std::array<bool, 2>);
+DECLARE_DISPATCH(conv_depthwise2d_backward_fn, conv_depthwise2d_backward_stub);
+using conv_depthwise3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
+DECLARE_DISPATCH(conv_depthwise3d_backward_fn, conv_depthwise3d_backward_stub);
+using cudnn_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, int64_t, bool, bool, bool, std::array<bool,2>);
+DECLARE_DISPATCH(cudnn_convolution_backward_fn, cudnn_convolution_backward_stub);
+using mps_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, int64_t, std::array<bool,3>);
+DECLARE_DISPATCH(mps_convolution_backward_fn, mps_convolution_backward_stub);
+using cudnn_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, bool, std::array<bool,2>);
+DECLARE_DISPATCH(cudnn_convolution_transpose_backward_fn, cudnn_convolution_transpose_backward_stub);
+using miopen_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>);
+DECLARE_DISPATCH(miopen_convolution_backward_fn, miopen_convolution_backward_stub);
+using miopen_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>);
+DECLARE_DISPATCH(miopen_convolution_transpose_backward_fn, miopen_convolution_transpose_backward_stub);
+using miopen_depthwise_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, int64_t, bool, bool, std::array<bool,3>);
+DECLARE_DISPATCH(miopen_depthwise_convolution_backward_fn, miopen_depthwise_convolution_backward_stub);
+using mkldnn_convolution_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, int64_t, std::array<bool,3>);
+DECLARE_DISPATCH(mkldnn_convolution_backward_fn, mkldnn_convolution_backward_stub);
+using mkldnn_convolution_transpose_fn = Tensor(*)(const Tensor&, const Tensor&, const std::optional<Tensor>&,
+    IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t);
+DECLARE_DISPATCH(mkldnn_convolution_transpose_fn, mkldnn_convolution_transpose_stub);
+using mkldnn_convolution_transpose_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, int64_t, std::array<bool,3>);
+DECLARE_DISPATCH(mkldnn_convolution_transpose_backward_fn, mkldnn_convolution_transpose_backward_stub);
+using slow_conv_dilated2d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
+DECLARE_DISPATCH(slow_conv_dilated2d_backward_fn, slow_conv_dilated2d_backward_stub);
+using slow_conv_dilated3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, std::array<bool, 3>);
+DECLARE_DISPATCH(slow_conv_dilated3d_backward_fn, slow_conv_dilated3d_backward_stub);
+using slow_conv_transpose2d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array<bool,3>);
+DECLARE_DISPATCH(slow_conv_transpose2d_backward_fn, slow_conv_transpose2d_backward_stub);
+using slow_conv_transpose3d_backward_fn = std::tuple<at::Tensor,at::Tensor,at::Tensor>(*)(
+    const at::Tensor&, const at::Tensor&, const at::Tensor&, at::IntArrayRef, at::IntArrayRef,
+    at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, std::array<bool,3>);
+DECLARE_DISPATCH(slow_conv_transpose3d_backward_fn, slow_conv_transpose3d_backward_stub);
+
+namespace {
+  bool is_cudnnv8_heuristic_mode_b() {
+    static const bool is_cudnnv8_heuristic_mode_b = c10::utils::check_env("TORCH_CUDNN_USE_HEURISTIC_MODE_B") == true;
+    return is_cudnnv8_heuristic_mode_b;
+  }
+}
+
+inline bool cudnnv8_enabled_check_debug() {
+  static bool cudnnv8_flag = c10::utils::check_env("TORCH_CUDNN_V8_API_DISABLED") != true;
+  static bool cudnnv8_debug = c10::utils::check_env("TORCH_CUDNN_V8_API_DEBUG") == true;
+  static uint8_t cudnnv8_debugcount = 0;
+  if (cudnnv8_debug == 1 && cudnnv8_debugcount < 10) {
+    TORCH_WARN("TORCH_CUDNN_V8_DEBUG ON, V8 ON: ", cudnnv8_flag, " TORCH_CUDNN_USE_HEURISTIC_MODE B: ", is_cudnnv8_heuristic_mode_b());
+    cudnnv8_debugcount++;
+  }
+  return cudnnv8_flag == 1;
+}
+
+inline bool cudnnv8_use_heur_mode_b() {
+  return is_cudnnv8_heuristic_mode_b();
+}
+
+// Keep in sync with py::enum_ in Module.cpp
+enum class ConvBackend {
+  CudaDepthwise2d,
+  CudaDepthwise3d,
+  Cudnn,
+  CudnnTranspose,
+  Empty,
+  Miopen,
+  MiopenDepthwise,
+  MiopenTranspose,
+  Mkldnn,
+  MkldnnTranspose,
+  MkldnnEmpty,
+  NnpackSpatial,
+  Overrideable,
+  Slow2d,
+  Slow3d,
+  SlowDilated2d,
+  SlowDilated3d,
+  SlowTranspose2d,
+  SlowTranspose3d,
+  Winograd3x3Depthwise,
+  Xnnpack2d,
+  Mps,
+  MpsTranspose,
+};
+
+// Overload for selecting the convolution backend from the full set of convolution inputs.
+// This overload is exposed to python for testing, etc.
+TORCH_API ConvBackend select_conv_backend(
+    const Tensor& input, const Tensor& weight, const std::optional<Tensor>& bias_opt,
+    SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation,
+    bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt);
+
+TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input,
+    const Tensor& weight,
+    const ConvBackend backend);
+
+// ---------------------------------------------------------------------
+//
+// Math
+//
+// ---------------------------------------------------------------------
+
+constexpr int input_batch_size_dim = 0;  // also grad_input
+constexpr int input_channels_dim = 1;
+constexpr int output_batch_size_dim = 0;  // also grad_output
+constexpr int output_channels_dim = 1;
+constexpr int weight_output_channels_dim = 0;
+constexpr int weight_input_channels_dim = 1;
+
+// Often written as 2 + max_dim (extra dims for batch size and channels)
+constexpr int max_dim = 3;
+
+// ---------------------------------------------------------------------
+//
+// Checking
+//
+// ---------------------------------------------------------------------
+
+// Used on pad, stride and dilation
+static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name)
+{
+  TORCH_CHECK(args.size() <= expected_size,
+           "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ",
+           expected_size, " (while checking arguments for ", c, ")");
+  TORCH_CHECK(args.size() >= expected_size,
+           "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ",
+           expected_size, " (while checking arguments for ", c, ")");
+
+  auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;});
+  if (num_negative_values > 0){
+    std::stringstream ss;
+    ss << arg_name << " should be greater than zero but got (";
+    std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
+    ss << args.back() <<  ")" << " (while checking arguments for " << c << ")";
+    AT_ERROR(ss.str());
+  }
+}
+
+
+// NOTE [ Convolution checks ]
+//
+// NB: For many call sites, it is not strictly necessary to check all of
+// these relationships (for example, for forward convolution, we compute
+// the size of output ourselves, so we don't actually need to check
+// output.  However, writing a single function that does everything
+// means we get to reuse it for both forwards and all backwards
+// variants, even when the set of "real" inputs varies.  The magic of
+// relational computing!
+//
+// (There is one downside, which is that it is slightly harder to write
+// error messages which are able to distinguish between real inputs
+// (which the user can change) and computed inputs (which the user can
+// only indirectly affect).  It would be an interesting exercise to
+// come up with a general framework to handle such situations.)
+inline void convolution_shape_check(
+    CheckedFrom c,
+    const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output,
+    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups)
+{
+  check_args(c, padding, input->dim() - 2, "padding");
+  check_args(c, stride, padding.size(), "stride");
+  check_args(c, dilation, padding.size(), "dilation");
+
+  // Input
+  checkDimRange(c, input, 3, 6 /* exclusive */);
+  checkSize_symint(c, input, input_channels_dim, weight->size(1) * groups);
+
+  // Weight
+  checkSameDim(c, input, weight);
+
+  // TODO: check that output->size() matches output_sizes
+  // TODO: check that weight matches output->sizes()
+  checkSameDim(c, input, output);
+}
+
+// NB: conv_output_size and conv_input_size are not bijections,
+// as conv_output_size loses information; this is why conv_input_size
+// takes an extra output_padding argument to resolve the ambiguity.
+
+template <typename T>
+inline std::vector<T> _conv_output_size(
+    ArrayRef<T> input_size, ArrayRef<T> weight_size,
+    ArrayRef<T> padding, ArrayRef<T> stride, ArrayRef<T> dilation = ArrayRef<T>()
+) {
+  // ASSERT(input_size.size() > 2)
+  // ASSERT(input_size.size() == weight_size.size())
+  bool has_dilation = !dilation.empty();
+  auto dim = input_size.size();
+  std::vector<T> output_size(dim);
+  output_size[0] = input_size[input_batch_size_dim];
+  output_size[1] = weight_size[weight_output_channels_dim];
+  for (const auto d : c10::irange(2, dim)) {
+    auto dilation_ = has_dilation ? dilation[d - 2] : 1;
+    auto kernel = dilation_ * (weight_size[d] - 1) + 1;
+    output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1;
+  }
+  return output_size;
+}
+
+inline std::vector<int64_t> conv_output_size(
+    IntArrayRef input_size, IntArrayRef weight_size,
+    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef()
+) {
+  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
+}
+
+inline std::vector<c10::SymInt> conv_output_size(
+    SymIntArrayRef input_size, SymIntArrayRef weight_size,
+    SymIntArrayRef padding, SymIntArrayRef stride, SymIntArrayRef dilation = SymIntArrayRef()
+) {
+  return _conv_output_size(input_size, weight_size, padding, stride, dilation);
+}
+
+template <typename T>
+std::vector<T> _conv_input_size(
+    ArrayRef<T> output_size, ArrayRef<T> weight_size,
+    ArrayRef<T> padding, ArrayRef<T> output_padding, ArrayRef<T> stride, ArrayRef<T> dilation, T groups
+) {
+  // ASSERT(output_size.size() > 2)
+  // ASSERT(output_size.size() == weight_size.size())
+  auto dim = output_size.size();
+  std::vector<T> input_size(dim);
+  input_size[0] = output_size[output_batch_size_dim];
+  input_size[1] = weight_size[weight_input_channels_dim] * groups;
+  for (const auto d : c10::irange(2, dim)) {
+    auto kernel = (weight_size[d] - 1) * dilation[d - 2] + 1;
+    input_size[d] = (output_size[d] - 1) * stride[d - 2] - (padding[d - 2] * 2) +
+                     kernel + output_padding[d - 2];
+  }
+  return input_size;
+}
+
+inline std::vector<c10::SymInt> conv_input_size(
+    SymIntArrayRef output_size, SymIntArrayRef weight_size,
+    SymIntArrayRef padding, SymIntArrayRef output_padding, SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups
+) {
+  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, std::move(groups));
+}
+
+inline std::vector<int64_t> conv_input_size(
+    IntArrayRef output_size, IntArrayRef weight_size,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups);
+}
+
+template <typename T>
+std::vector<T> _conv_weight_size(
+    ArrayRef<T> input_size, ArrayRef<T> output_size,
+    ArrayRef<T> padding, ArrayRef<T> output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  auto dim = input_size.size();
+  std::vector<T> weight_size(dim);
+  weight_size[0] = output_size[1];
+  weight_size[1] = input_size[1] / groups;
+  for (const auto d : c10::irange(2, dim)) {
+    auto kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
+               + padding[d - 2] * 2 - output_padding[d - 2];
+    weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
+  }
+  return weight_size;
+}
+
+inline std::vector<c10::SymInt> conv_weight_size(
+    SymIntArrayRef input_size, SymIntArrayRef output_size,
+    SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
+}
+
+inline std::vector<int64_t> conv_weight_size(
+    IntArrayRef input_size, IntArrayRef output_size,
+    IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
+) {
+  return _conv_weight_size(input_size, output_size, padding, output_padding, stride, dilation, groups);
+}
+
+inline Tensor reshape_bias(int64_t dim, const Tensor& bias) {
+  std::vector<int64_t> shape(dim, 1);
+  shape[1] = -1;
+  return bias.reshape(shape);
+}
+
+inline at::MemoryFormat cudnn_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) {
+  // disable NHWC for float64 input.
+  if (!at::detail::getCUDAHooks().compiledWithCuDNN() ||
+      input.scalar_type() == at::kDouble ||
+      weight.scalar_type() == at::kDouble) {
+    return at::MemoryFormat::Contiguous;
+  }
+  long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
+  auto input_memory_format = input.suggest_memory_format();
+  auto weight_memory_format = weight.suggest_memory_format();
+  auto weight_ndim = weight.ndimension();
+
+  bool can_use_cudnn_channels_last_2d = (cudnn_version >= 7603) && (weight_ndim == 4) && (
+    (input_memory_format  == at::MemoryFormat::ChannelsLast) ||
+    (weight_memory_format == at::MemoryFormat::ChannelsLast)
+  );
+  if (can_use_cudnn_channels_last_2d) {
+    return at::MemoryFormat::ChannelsLast;
+  }
+
+  bool can_use_cudnn_channels_last_3d = (cudnn_version >= 8005) && (weight_ndim == 5) && (
+    (input_memory_format  == at::MemoryFormat::ChannelsLast3d) ||
+    (weight_memory_format == at::MemoryFormat::ChannelsLast3d)
+  );
+  if (can_use_cudnn_channels_last_3d) {
+    return at::MemoryFormat::ChannelsLast3d;
+  }
+
+  return at::MemoryFormat::Contiguous;
+}
+
+// controls whether emptyCache will be called following cudnn conv benchmarking
+TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable);
+TORCH_API bool _cudnn_get_conv_benchmark_empty_cache();
+
+
+inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
+
+  // disable NHWC for float64 input.
+  if (!at::detail::getCUDAHooks().compiledWithMIOpen() ||
+      input.scalar_type() == at::kDouble ||
+      weight.scalar_type() == at::kDouble) {
+    return false;
+  }
+
+  bool can_use_miopen_channels_last_2d = false;
+  // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
+  // See #64427
+  static std::optional<bool> PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");
+
+  auto input_memory_format = input.suggest_memory_format();
+  auto weight_memory_format = weight.suggest_memory_format();
+
+  can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC &&  *PYTORCH_MIOPEN_SUGGEST_NHWC && (
+            ( (input_memory_format  == at::MemoryFormat::ChannelsLast) ||
+            (weight_memory_format == at::MemoryFormat::ChannelsLast) )
+        );
+
+  bool can_use_miopen_channels_last_3d = false;
+
+  return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d;
+}
+
+inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
+
+  // disable NHWC for float64 input.
+  if (input.scalar_type() == at::kDouble ||
+      weight.scalar_type() == at::kDouble) {
+    return false;
+  }
+
+  // disable NHWC for MkldnnCPU tensor.
+  if (input.is_mkldnn() || weight.is_mkldnn()) {
+    return false;
+  }
+
+  auto input_memory_format = input.suggest_memory_format();
+  auto weight_memory_format = weight.suggest_memory_format();
+
+  bool can_use_mkldnn_channels_last_2d =
+      (input_memory_format  == at::MemoryFormat::ChannelsLast) ||
+      (weight_memory_format == at::MemoryFormat::ChannelsLast);
+
+  bool can_use_mkldnn_channels_last_3d =
+      (input_memory_format  == at::MemoryFormat::ChannelsLast3d) ||
+      (weight_memory_format == at::MemoryFormat::ChannelsLast3d);
+
+  return can_use_mkldnn_channels_last_2d || can_use_mkldnn_channels_last_3d;
+}
+
+inline bool thnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
+
+  auto input_memory_format = input.suggest_memory_format();
+  auto weight_memory_format = weight.suggest_memory_format();
+
+  bool can_use_thnn_channels_last_2d = input.device().is_cpu() && (
+      (input_memory_format  == at::MemoryFormat::ChannelsLast) || (
+       weight_memory_format == at::MemoryFormat::ChannelsLast));
+
+  return can_use_thnn_channels_last_2d;
+}
+
+inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
+
+  // check layout only for xpu tensor.
+  if (!input.is_xpu() || !weight.is_xpu()) {
+    return false;
+  }
+
+  // disable NHWC for float64 input.
+  if (input.scalar_type() == at::kDouble ||
+      weight.scalar_type() == at::kDouble) {
+    return false;
+  }
+
+  auto input_memory_format = input.suggest_memory_format();
+  auto weight_memory_format = weight.suggest_memory_format();
+
+  bool can_use_xpu_channels_last_2d =
+      (input_memory_format  == at::MemoryFormat::ChannelsLast) ||
+      (weight_memory_format == at::MemoryFormat::ChannelsLast);
+
+  bool can_use_xpu_channels_last_3d =
+      (input_memory_format  == at::MemoryFormat::ChannelsLast3d) ||
+      (weight_memory_format == at::MemoryFormat::ChannelsLast3d);
+
+  return can_use_xpu_channels_last_2d || can_use_xpu_channels_last_3d;
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvolutionMM3d.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvolutionMM3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..3de6763015c6616599a604ee169dacc55985a385
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ConvolutionMM3d.h
@@ -0,0 +1,14 @@
+#include <ATen/core/Tensor.h>
+
+namespace at::native {
+
+std::tuple<Tensor, Tensor, Tensor> slow_conv3d_backward_cpu(
+    const Tensor& grad_output,
+    const Tensor& self,
+    const Tensor& weight,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    std::array<bool, 3> output_mask);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Copy.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..14abb32fa5ad4ba3cd8c78084569b313a4a692cd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Copy.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+
+class Tensor;
+struct TensorIterator;
+class TensorBase;
+
+namespace native {
+
+using copy_fn = void (*)(TensorIterator&, bool non_blocking);
+
+DECLARE_DISPATCH(copy_fn, copy_stub);
+
+TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h
new file mode 100644
index 0000000000000000000000000000000000000000..9daee7f2d6c43586630ad38ac731b22ce2416ff3
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Cross.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+class Tensor;
+
+namespace native {
+
+using cross_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const int64_t d);
+
+DECLARE_DISPATCH(cross_fn, cross_stub);
+
+}} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd580020374a66aa058938e1186fbfd577a76980
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DilatedConvolutionUtils.h
@@ -0,0 +1,229 @@
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include <ATen/div_rtn.h>
+#include <ATen/core/Tensor.h>
+#include <c10/util/irange.h>
+
+#define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
+  TORCH_CHECK(                                       \
+      T.dim() == DIM && T.size(DIM_SIZE) == SIZE,    \
+      "Need " #T " of dimension ",                   \
+      DIM,                                           \
+      " and " #T ".size[",                           \
+      DIM_SIZE,                                      \
+      "] == ",                                       \
+      SIZE,                                          \
+      " but got input to be of shape ",              \
+      T.sizes())
+
+namespace at::native::internal {
+namespace {
+inline bool all_positive(IntArrayRef& arr) {
+  return std::all_of(
+      arr.begin(), arr.end(), [](int64_t item) { return item > 0; });
+}
+
+inline bool all_nonnegative(std::vector<int64_t>& arr) {
+  return std::all_of(
+      arr.begin(), arr.end(), [](int64_t item) { return item >= 0; });
+}
+
+} // namespace
+
+// calculate the rear part of output tensor sizes
+template <int64_t dim>
+std::vector<int64_t> get_output_size(
+    const Tensor& input,
+    IntArrayRef kernel_size,
+    IntArrayRef stride_size,
+    IntArrayRef pad_size,
+    IntArrayRef dilation_size) {
+  std::vector<int64_t> sizes;
+  for (const auto index : c10::irange(dim)) {
+    sizes.push_back(
+        div_rtn<int64_t>(
+            input.size(index + input.dim() - dim) + 2 * pad_size[index] -
+                (dilation_size[index] * (kernel_size[index] - 1) + 1),
+            stride_size[index]) +
+        1);
+  }
+  return sizes;
+}
+
+// calculate the sizes of output tensor
+template <int64_t dim>
+std::vector<int64_t> get_output_size(
+    const Tensor& input,
+    const Tensor& weight,
+    IntArrayRef kernel_size,
+    IntArrayRef stride_size,
+    IntArrayRef pad_size,
+    IntArrayRef dilation_size) {
+  auto output_size = get_output_size<dim>(
+      input, kernel_size, stride_size, pad_size, dilation_size);
+  output_size.insert(output_size.begin(), weight.size(0));
+  if (input.dim() == dim + 2) {
+    output_size.insert(output_size.begin(), input.size(0));
+  }
+  return output_size;
+}
+/*
+  slow_conv_dilated_shape_check - check user-input to dilated convolution
+  forward and backward functions.
+*/
+template <int64_t dim>
+void slow_conv_dilated_shape_check(
+    const Tensor& input,
+    const Tensor& weight,
+    const Tensor& bias,
+    const Tensor& grad_output,
+    IntArrayRef kernel_size,
+    IntArrayRef stride_size,
+    IntArrayRef pad_size,
+    IntArrayRef dilation_size) {
+  /*
+    When the following tensors are defined:
+
+    bias, grad_weight, grad_output
+
+    then these are assumed to be contiguous without checking
+    because of these tensors are made contiguous by calling
+    .contiguous() method or by resizing of zero-sized tensors in
+    forward/backward functions.
+
+    When grad_weight is defined then it is assumed without
+    checking to have the same shape as weight, see backward
+    functions.
+   */
+  // Check size arguments
+  TORCH_CHECK(
+      kernel_size.size() == dim,
+      "kernel sizes length should be ",
+      dim,
+      ", but got ",
+      kernel_size.size());
+  TORCH_CHECK(
+      stride_size.size() == dim,
+      "strides length should be ",
+      dim,
+      ", but got ",
+      stride_size.size());
+  TORCH_CHECK(
+      dilation_size.size() == dim,
+      "dilations length should be ",
+      dim,
+      ", but got ",
+      dilation_size.size());
+  TORCH_CHECK(
+      pad_size.size() == dim,
+      "pads length should be ",
+      dim,
+      ", but got ",
+      pad_size.size());
+
+  TORCH_CHECK(
+      all_positive(kernel_size),
+      "kernel size should be greater than zero, but got ",
+      kernel_size);
+  TORCH_CHECK(
+      all_positive(stride_size),
+      "stride should be greater than zero, but got ",
+      stride_size);
+  TORCH_CHECK(
+      all_positive(dilation_size),
+      "dilation should be greater than zero, but got ",
+      dilation_size);
+
+  // check input
+  TORCH_CHECK(input.defined(), "input must be defined");
+  bool is_batch = input.dim() == dim + 2;
+  int64_t n = (is_batch ? 2 : 1);
+  int64_t ndim = n + dim;
+  if (!is_batch) {
+    // input dim has to be dim + 1 if not batched
+    TORCH_CHECK(
+        input.dim() == dim + 1,
+        "input must be 4D or 5D tensor but got ",
+        input.dim(),
+        "D tensor");
+  }
+
+  // check output sizes
+  auto output_size = get_output_size<dim>(
+      input, kernel_size, stride_size, pad_size, dilation_size);
+
+  TORCH_CHECK(
+      all_nonnegative(output_size),
+      "calculated output size ",
+      output_size,
+      " is too small (all sizes must be non-negative)");
+
+  // check weight
+  TORCH_CHECK(weight.defined(), "weight must be defined");
+  TORCH_CHECK(
+      weight.dim() == dim + 2,
+      "weight must be ",
+      dim + 2,
+      "D tensor but got ",
+      weight.dim(),
+      "D tensor dim=",
+      dim);
+  TORCH_CHECK(
+      weight.sizes().slice(2) == kernel_size,
+      "weight[2:] shape ",
+      weight.sizes().slice(2),
+      " must be equal to kernel_size ",
+      kernel_size);
+
+  TORCH_CHECK_DIM_SIZE(input, input.dim(), (is_batch ? 1 : 0), weight.size(1));
+
+  // check bias when present
+  if (bias.defined()) {
+    TORCH_CHECK(
+        bias.dim() == 1,
+        "bias must be 1D tensor but got ",
+        bias.dim(),
+        "D tensor");
+    TORCH_CHECK_DIM_SIZE(bias, 1, 0, weight.size(0));
+  }
+
+  // check grad_output when present
+  if (grad_output.defined()) {
+    TORCH_CHECK(
+        grad_output.dim() == ndim,
+        "grad_output must be ",
+        ndim,
+        "D tensor but got ",
+        grad_output.dim(),
+        "D tensor");
+    if (is_batch) {
+      TORCH_CHECK(
+          grad_output.size(0) == input.size(0),
+          "grad_output.size(0)=",
+          grad_output.size(0),
+          " must be input.size(0)=",
+          input.size(0));
+    }
+    TORCH_CHECK(
+        grad_output.size(n - 1) == weight.size(0),
+        "grad_output.size(",
+        n - 1,
+        ")=",
+        grad_output.size(n - 1),
+        " must be weight.size(0)=",
+        weight.size(0));
+    TORCH_CHECK(
+        grad_output.sizes().slice(n) == output_size,
+        "grad_output[",
+        n,
+        ":] shape",
+        grad_output.sizes().slice(n),
+        " must be equal to output size ",
+        output_size);
+  }
+}
+
+} // namespace at::native::internal
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DispatchStub.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DispatchStub.h
new file mode 100644
index 0000000000000000000000000000000000000000..641b779701d1e207bd709faa77191816fa8bc1b8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DispatchStub.h
@@ -0,0 +1,444 @@
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Array.h>
+
+#include <atomic>
+#include <utility>
+#include <variant>
+
+// Implements instruction set specific function dispatch.
+//
+// Kernels that may make use of specialized instruction sets (e.g. AVX2) are
+// compiled multiple times with different compiler flags (e.g. -mavx2). A
+// DispatchStub contains a table of function pointers for a kernel. At runtime,
+// the fastest available kernel is chosen based on the features reported by
+// cpuinfo.
+//
+// Example:
+//
+// In native/MyKernel.h:
+//   using fn_type = void(*)(const Tensor& x);
+//   DECLARE_DISPATCH(fn_type, stub);
+//
+// In native/MyKernel.cpp
+//   DEFINE_DISPATCH(stub);
+//
+// In native/cpu/MyKernel.cpp:
+//   namespace {
+//     // use anonymous namespace so that different cpu versions won't conflict
+//     void kernel(const Tensor& x) { ... }
+//   }
+//   REGISTER_DISPATCH(stub, &kernel);
+//
+// To call:
+//   stub(kCPU, tensor);
+//
+// TODO: CPU instruction set selection should be folded into whatever
+// the main dispatch mechanism is.
+//
+// Supported device types for registration:
+//   - CPU: Central Processing Unit
+//   - CUDA: NVIDIA GPUs
+//   - HIP: AMD GPUs
+//   - MPS: Apple Silicon GPUs (Metal Performance Shaders)
+//   - MTIA: Meta Training and Inference Devices
+//   - XPU: Intel GPUs
+//   - PrivateUse1: Reserved for private/custom device types
+//
+// If you want to update the list of supported devices, add a new dispatch_ptr
+// member in DispatchStubImpl.h and update the get_call_ptr switch.
+// As well you will need to update the inlined list in 'is_device_supported`
+//
+//
+// ignore warnings about DispatchStub::DEFAULT, AVX, AVX2 defined elsewhere
+C10_CLANG_DIAGNOSTIC_PUSH()
+C10_CLANG_DIAGNOSTIC_IGNORE("-Wundefined-var-template")
+
+namespace at::native {
+
+enum class CPUCapability {
+  DEFAULT = 0,
+#if defined(HAVE_VSX_CPU_DEFINITION)
+  VSX = 1,
+#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
+  ZVECTOR = 1,
+#else
+  AVX2 = 1,
+  AVX512 = 2,
+#endif
+  NUM_OPTIONS
+};
+
+// Enum for error types
+enum class ErrorType {
+  MissingDeviceKernel,
+  DeviceNotSupported
+};
+
+// Alias for the return type using std::variant
+using DispatchResult = std::variant<void*, ErrorType>;
+
+CPUCapability get_cpu_capability();
+
+template <typename FnPtr, typename T>
+struct DispatchStub;
+
+/**
+ * The sole purpose of this class is to outline methods that don't need to be
+ * specialized or otherwise inlined and duplicated (by the compiler due to
+ * template expansion), since it causes size bloat if there are a significant
+ * number of specialization of the DispatchStub<> class.
+ */
+struct TORCH_API DispatchStubImpl {
+
+  // The DispatchStubImpl::try_get_call_ptr() method is used to get the call
+  // pointer for a given device type. If the call pointer is not found,
+  // DispatchStubImpl::try_get_call_ptr() returns an ErrorType.
+  // The main difference between try_get_call_ptr() and get_call_ptr() is that
+  // try_get_call_ptr() will return the ErrorType and not raise an exception.
+  DispatchResult try_get_call_ptr(
+    c10::DeviceType device_type
+    , void *DEFAULT
+#ifdef HAVE_AVX512_CPU_DEFINITION
+      , void *AVX512
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+      , void *AVX2
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+      , void *VSX
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+      , void *ZVECTOR
+#endif
+  );
+
+  // Analogous to try_get_call_ptr(), but it will return the ErrorType and not
+  // raise an exception.
+  DispatchResult try_choose_cpu_impl(
+    void *DEFAULT
+#ifdef HAVE_AVX512_CPU_DEFINITION
+    , void *AVX512
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+    , void *AVX2
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+    , void *VSX
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+    , void *ZVECTOR
+#endif
+  );
+
+
+  void* get_call_ptr(
+    c10::DeviceType device_type
+    , void *DEFAULT
+#ifdef HAVE_AVX512_CPU_DEFINITION
+      , void *AVX512
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+      , void *AVX2
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+      , void *VSX
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+      , void *ZVECTOR
+#endif
+  );
+
+  /**
+   * The CPU Dispatch actual method is chosen in decreasing order of preference by
+   * DispatchStubImpl::choose_cpu_impl() in case none is found by
+   * DispatchStubImpl::get_call_ptr() in cpu_dispatch_ptr.
+   */
+  void* choose_cpu_impl(
+    void *DEFAULT
+#ifdef HAVE_AVX512_CPU_DEFINITION
+    , void *AVX512
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+    , void *AVX2
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+    , void *VSX
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+    , void *ZVECTOR
+#endif
+  );
+
+  // Fixing dispatch error in Windows debug builds.
+  // See https://github.com/pytorch/pytorch/issues/22681 for more details.
+  #if defined(_MSC_VER) && defined(_DEBUG)
+    std::atomic<void*> cpu_dispatch_ptr;
+    void* cuda_dispatch_ptr;
+    void* hip_dispatch_ptr;
+    void* mps_dispatch_ptr;
+    void* mtia_dispatch_ptr;
+  #if defined(USE_XPU)
+    void* xpu_dispatch_ptr;
+  #endif
+    void* privateuse1_dispatch_ptr;
+  #else
+    std::atomic<void*> cpu_dispatch_ptr{nullptr};
+    void* cuda_dispatch_ptr = nullptr;
+    void* hip_dispatch_ptr = nullptr;
+    void* mps_dispatch_ptr = nullptr;
+    void* mtia_dispatch_ptr = nullptr;
+  #if defined(USE_XPU)
+    void* xpu_dispatch_ptr = nullptr;
+  #endif
+    void* privateuse1_dispatch_ptr = nullptr;
+  #endif
+};
+
+template <typename rT, typename T, typename... Args>
+struct DispatchStub<rT (*)(Args...), T> {
+  using FnPtr = rT (*) (Args...);
+
+  DispatchStub() = default;
+  DispatchStub(const DispatchStub&) = delete;
+  DispatchStub& operator=(const DispatchStub&) = delete;
+
+private:
+  FnPtr get_call_ptr(const c10::DeviceType device_type) {
+    return reinterpret_cast<FnPtr>(
+      impl.get_call_ptr(device_type
+      , reinterpret_cast<void*>(DEFAULT)
+#ifdef HAVE_AVX512_CPU_DEFINITION
+      , reinterpret_cast<void*>(AVX512)
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+      , reinterpret_cast<void*>(AVX2)
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+      , reinterpret_cast<void*>(VSX)
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+      , reinterpret_cast<void*>(ZVECTOR)
+#endif
+      )
+    );
+  }
+
+public:
+  template <typename... ArgTypes>
+  rT operator()(c10::DeviceType device_type, ArgTypes&&... args) {
+    FnPtr call_ptr = get_call_ptr(device_type);
+    return (*call_ptr)(std::forward<ArgTypes>(args)...);
+  }
+
+  void set_cuda_dispatch_ptr(FnPtr fn_ptr) {
+    impl.cuda_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+  }
+
+  #if defined(USE_XPU)
+  void set_xpu_dispatch_ptr(FnPtr fn_ptr){
+    impl.xpu_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+  }
+  #endif
+
+  void set_hip_dispatch_ptr(FnPtr fn_ptr) {
+    impl.hip_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+  }
+
+  void set_mps_dispatch_ptr(FnPtr fn_ptr) {
+    impl.mps_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+  }
+
+    void set_mtia_dispatch_ptr(FnPtr fn_ptr) {
+    impl.mtia_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+  }
+
+  void set_privateuse1_dispatch_ptr(FnPtr fn_ptr) {
+    impl.privateuse1_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+  }
+
+  // Returns true if the dispatcher has a kernel registered for this device
+  // type.
+  bool is_device_supported(const c10::DeviceType device_type) {
+    auto result = impl.try_get_call_ptr(device_type
+      , reinterpret_cast<void*>(DEFAULT)
+#ifdef HAVE_AVX512_CPU_DEFINITION
+      , reinterpret_cast<void*>(AVX512)
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+      , reinterpret_cast<void*>(AVX2)
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+      , reinterpret_cast<void*>(VSX)
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+      , reinterpret_cast<void*>(ZVECTOR)
+#endif
+      );
+    if (std::holds_alternative<ErrorType>(result)){
+      return false;
+    }
+    return true;
+  };
+
+  static TORCH_API FnPtr DEFAULT;
+#ifdef HAVE_AVX512_CPU_DEFINITION
+  static TORCH_API FnPtr AVX512;
+#endif
+#ifdef HAVE_AVX2_CPU_DEFINITION
+  static TORCH_API FnPtr AVX2;
+#endif
+#ifdef HAVE_VSX_CPU_DEFINITION
+  static TORCH_API FnPtr VSX;
+#endif
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+  static TORCH_API FnPtr ZVECTOR;
+#endif
+private:
+  DispatchStubImpl impl;
+};
+
+namespace {
+template <typename DispatchStub>
+struct RegisterCUDADispatch {
+  RegisterCUDADispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
+    stub.set_cuda_dispatch_ptr(value);
+  }
+};
+
+template <typename DispatchStub>
+struct RegisterXPUDispatch {
+  RegisterXPUDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value){
+    stub.set_xpu_dispatch_ptr(value);
+  }
+};
+
+template <typename DispatchStub>
+struct RegisterMPSDispatch {
+  RegisterMPSDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
+    stub.set_mps_dispatch_ptr(value);
+  }
+};
+
+template <typename DispatchStub>
+struct RegisterHIPDispatch {
+  RegisterHIPDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
+    // TODO: make this point at hip_dispatch_ptr
+    stub.set_cuda_dispatch_ptr(value);
+  }
+};
+
+template <typename DispatchStub>
+struct RegisterMTIADispatch {
+  RegisterMTIADispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
+    stub.set_mtia_dispatch_ptr(value);
+  }
+};
+
+template <typename DispatchStub>
+struct RegisterPRIVATEUSE1Dispatch {
+  RegisterPRIVATEUSE1Dispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
+    stub.set_privateuse1_dispatch_ptr(value);
+  }
+};
+
+} // anonymous namespace
+// Compiler will complain if you put things like std::tuple<Tensor, Tensor> in
+// the `fn` argument of DECLARE_DISPATCH. Some possible workarounds, e.g.,
+// adding parentheses and using helper struct to get rid of the parentheses, do
+// not work with MSVC. So do a `using`-declaration if you need to pass in such
+// `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h.
+#define DECLARE_DISPATCH(fn, name)                                                         \
+  struct name##_DECLARE_DISPATCH_type : DispatchStub<fn, name##_DECLARE_DISPATCH_type> {   \
+    name##_DECLARE_DISPATCH_type() = default;                                              \
+    name##_DECLARE_DISPATCH_type(const name##_DECLARE_DISPATCH_type&) = delete;            \
+    name##_DECLARE_DISPATCH_type& operator=(const name##_DECLARE_DISPATCH_type&) = delete; \
+  };                                                                                       \
+  extern TORCH_API struct name##_DECLARE_DISPATCH_type name;
+
+#define DEFINE_DISPATCH(name) struct name##_DECLARE_DISPATCH_type name
+
+#define REGISTER_ARCH_DISPATCH(name, arch, fn) \
+  template <> name##_DECLARE_DISPATCH_type::FnPtr TORCH_API DispatchStub<name##_DECLARE_DISPATCH_type::FnPtr, struct name##_DECLARE_DISPATCH_type>::arch = fn;
+
+#ifdef HAVE_AVX512_CPU_DEFINITION
+#define REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX512, fn)
+#else
+#define REGISTER_AVX512_DISPATCH(name, fn)
+#endif
+
+#ifdef HAVE_AVX2_CPU_DEFINITION
+#define REGISTER_AVX2_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX2, fn)
+#else
+#define REGISTER_AVX2_DISPATCH(name, fn)
+#endif
+
+#ifdef HAVE_VSX_CPU_DEFINITION
+#define REGISTER_VSX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, VSX, fn)
+#else
+#define REGISTER_VSX_DISPATCH(name, fn)
+#endif
+
+#ifdef HAVE_ZVECTOR_CPU_DEFINITION
+#define REGISTER_ZVECTOR_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, ZVECTOR, fn)
+#else
+#define REGISTER_ZVECTOR_DISPATCH(name, fn)
+#endif
+
+// Macro to register the same kernel for all CPU arch types. This is useful
+// if a kernel does not benefit from being recompiled across different arch types.
+#define REGISTER_ALL_CPU_DISPATCH(name, fn)                                    \
+  REGISTER_ARCH_DISPATCH(name, DEFAULT, fn)                                    \
+  REGISTER_AVX512_DISPATCH(name, fn)                                           \
+  REGISTER_AVX2_DISPATCH(name, fn)                                             \
+  REGISTER_VSX_DISPATCH(name, fn)                                              \
+  REGISTER_ZVECTOR_DISPATCH(name, fn)
+
+#define REGISTER_NO_CPU_DISPATCH(name)                                         \
+  REGISTER_ALL_CPU_DISPATCH(name, nullptr)
+
+#define REGISTER_CUDA_DISPATCH(name, fn) \
+  static RegisterCUDADispatch<struct name##_DECLARE_DISPATCH_type> name ## __register(name, fn);
+
+#define REGISTER_XPU_DISPATCH(name, fn) \
+  static RegisterXPUDispatch<struct name##_DECLARE_DISPATCH_type> name ## __register(name, fn);
+
+#define REGISTER_HIP_DISPATCH(name, fn) \
+  static RegisterHIPDispatch<struct name##_DECLARE_DISPATCH_type> name ## __register(name, fn);
+
+#define REGISTER_MPS_DISPATCH(name, fn) \
+  static RegisterMPSDispatch<struct name##_DECLARE_DISPATCH_type> name ## __register(name, fn);
+
+#define REGISTER_MTIA_DISPATCH(name, fn) \
+  static RegisterMTIADispatch<struct name##_DECLARE_DISPATCH_type> name ## __register(name, fn);
+
+#define REGISTER_PRIVATEUSE1_DISPATCH(name, fn) \
+  static RegisterPRIVATEUSE1Dispatch<struct name##_DECLARE_DISPATCH_type> name ## __register(name, fn);
+
+// NB: This macro must be used in an actual 'cu' file; if you try using
+// it from a 'cpp' file it will not work!
+#if defined(__CUDACC__)
+#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn)
+#elif defined(__HIPCC__)
+// TODO: cut this over to HIP dispatch once we stop pretending that CUDA
+// is HIP in the PyTorch HIPify build.
+#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn)
+// #define REGISTER_DISPATCH(name, fn) REGISTER_HIP_DISPATCH(name, fn)
+#elif defined(__OBJC__) && defined(USE_MPS)
+// NB: this macro must be used from a 'mm' file in order to dispatch a MPS kernel
+#define REGISTER_DISPATCH(name, fn) REGISTER_MPS_DISPATCH(name, fn)
+#elif defined(CPU_CAPABILITY)
+// REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches.
+// ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others.
+#ifdef CPU_CAPABILITY_AVX512
+#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, ((void*)(fn) ? nullptr : nullptr))
+#else
+#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
+#endif
+#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
+#endif
+} // namespace at::native
+
+C10_CLANG_DIAGNOSTIC_POP()
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h
new file mode 100644
index 0000000000000000000000000000000000000000..38c171e56dfae97f5162a5b70e6ceedd0a91db7d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DistributionTemplates.h
@@ -0,0 +1,394 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
+#include <ATen/Generator.h>
+#include <ATen/ExpandUtils.h>
+#include <ATen/Tensor.h>
+#include <ATen/MemoryOverlap.h>
+#include <ATen/NamedTensorUtils.h>
+#include <ATen/native/Resize.h>
+#include <ATen/native/TensorIterator.h>
+#include <cmath>
+#include <limits>
+#include <optional>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/full.h>
+#include <ATen/ops/view_as_real.h>
+#endif
+
+namespace at::native::templates {
+
+// ==================================================== Random ========================================================
+
+// The purpose of `update_from` and `update_to` is to find the closest valid int64_t number that can be used as actual `from`.
+// The current implementation of `random_` uses uint64_t arithmetics and casts the result to the target dtype(scalar_t).
+// This casting can result in generating numbers that happen to be greater or equal to `to` value. For instance:
+//
+//    auto actual = torch::empty({3, 3}, torch::half);
+//    actual.random_(0, 65504);
+//
+// If random's uint64_t arithmetics produces 65503 as a random value after casting to torch::half it becomes 65504
+// and violates the requirement that random value must be less than `to`. To resolve this issue `update_from` and `update_to`
+// moves `from` to the right and `to` to the left to the next closest value that won't go outside [from, to) after casting to
+// the target dtype. For `to` = 65504 it moves left for (1 << (log2(to) - 11 + 1)) = 32 and becomes 65472, which is previous
+// available number for torch::half dtype.
+template<typename scalar_t>
+int64_t update_from(int64_t from) {
+  static_assert(
+    std::is_floating_point<scalar_t>::value ||
+    std::is_same<scalar_t, at::Half>::value ||
+    std::is_same<scalar_t, at::BFloat16>::value, "scalar_t must be floating-point type");
+  const auto from_plus_1 = static_cast<int64_t>(static_cast<scalar_t>(from + 1));
+  if (from_plus_1 < from) {
+    int64_t from_ = std::abs(from + 1);
+    int n = 0;
+    while (from_ >>= 1) ++n;
+    // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult)
+    from = from_plus_1 + (1LL << (n - std::numeric_limits<scalar_t>::digits + 1));
+  }
+  return from;
+}
+
+template<typename scalar_t>
+int64_t update_to(int64_t to) {
+  static_assert(
+    std::is_floating_point<scalar_t>::value ||
+    std::is_same<scalar_t, at::Half>::value ||
+    std::is_same<scalar_t, at::BFloat16>::value, "scalar_t must be floating-point type");
+  const auto to_minus_1 = static_cast<int64_t>(static_cast<scalar_t>(to - 1));
+  if (to_minus_1 >= to) {
+    int64_t to_ = std::abs(to - 1);
+    int n = 0;
+    while (to_ >>= 1) ++n;
+    // NOLINTNEXTLINE(clang-analyzer-core.UndefinedBinaryOperatorResult)
+    to = to_minus_1 - (1LL << (n - std::numeric_limits<scalar_t>::digits + 1));
+  }
+  return to;
+}
+
+// Return earlier for not invoking kernel.
+// See https://github.com/pytorch/pytorch/issues/103418 for more details
+#define CHECK_EMPTY_AND_RETURN(tensor) \
+  if (tensor.numel() == 0) {  \
+    return tensor;  \
+  }
+
+template<template<typename> class random_kernel, typename RNG>
+at::Tensor& random_impl(at::Tensor& self, std::optional<Generator> generator) {
+  CHECK_EMPTY_AND_RETURN(self);
+  auto iter = at::TensorIterator::borrowing_nullary_op(self);
+  random_kernel<RNG>()(iter, generator);
+  return self;
+}
+
+#define CHECK_OUT_OF_BOUNDS(var, name, min, max, dtype) \
+  TORCH_CHECK(var >= min && var <= max, name , " is out of bounds for ", dtype); \
+
+#define WARN_OUT_OF_BOUNDS(var, name, digits, dtype) \
+  if (var < -(1LL << digits) || var > (1LL << digits)) { \
+    TORCH_WARN(name , " is out of bounds [-(2^", digits, "), 2^", digits, "]. ", \
+      "Due to precision limitations ", dtype, " can support discrete uniform distribution only within this range. ", \
+      "This warning will become an error in version 1.7 release, please fix the code in advance"); \
+  }
+
+inline void check_from_to_in_range(int64_t from, int64_t to_inc, caffe2::TypeMeta dtype) {
+  const auto scalar_type = typeMetaToScalarType(dtype);
+  if (isFloatingType(scalar_type)) {
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "check_random_fp_bounds", [&] {
+      const auto min = static_cast<double>(std::numeric_limits<scalar_t>::lowest());
+      const auto max = static_cast<double>(std::numeric_limits<scalar_t>::max());
+      CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype);
+      CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype);
+
+      constexpr auto digits = std::numeric_limits<scalar_t>::digits;
+      WARN_OUT_OF_BOUNDS(from, "from", digits, dtype);
+      WARN_OUT_OF_BOUNDS(to_inc, "to - 1", digits, dtype);
+    });
+  } else if (scalar_type == kUInt64) {
+    // When you do a comparison between int64_t and uint64_t, the usual
+    // arithmetic conversions say that the int64_t value is promoted to
+    // unsigned. But this conversion wraps around: if I had -1 as my int64_t,
+    // then it will promote to 0xFFFFFFFFFFFFFFFF in uint64_t. This is never
+    // the right thing to do.
+    CHECK_OUT_OF_BOUNDS(from, "from", 0, INT64_MAX, dtype);
+    CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", 0, INT64_MAX, dtype);
+  } else if (isIntegralType(scalar_type, /*includeBool=*/true)) {
+    AT_DISPATCH_V2(scalar_type, "check_random_integral_bounds", AT_WRAP([&]() {
+      const auto min = static_cast<int64_t>(std::numeric_limits<scalar_t>::lowest());
+      const auto max = static_cast<int64_t>(std::numeric_limits<scalar_t>::max());
+      CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype);
+      CHECK_OUT_OF_BOUNDS(to_inc, "to - 1", min, max, dtype);
+    }), AT_EXPAND(AT_INTEGRAL_TYPES), kUInt16, kUInt32, kBool);
+  } else {
+    TORCH_CHECK(false, "check_random_bounds handles only integral, floating-point and boolean types");
+  }
+}
+
+template<template<typename> class random_from_to_kernel, typename RNG>
+at::Tensor& random_from_to_impl(at::Tensor& self, int64_t from, std::optional<int64_t> to_opt, std::optional<Generator> generator) {
+  uint64_t range = 0;
+  auto iter = at::TensorIterator::borrowing_nullary_op(self);
+  if (to_opt.has_value()) {
+    // [from, to)
+    int64_t to = *to_opt;
+    TORCH_CHECK(from < to, "random_ expects 'from' to be less than 'to', but got from=", from, " >= to=", to);
+    if (isFloatingType(iter.dtype())) {
+      AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_update_from_to", [&] {
+        from = update_from<scalar_t>(from);
+        to = update_to<scalar_t>(to);
+        TORCH_CHECK(from < to, "random_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", from, " >= to=", to);
+      });
+    }
+    check_from_to_in_range(from, to - 1, self.dtype());
+    CHECK_EMPTY_AND_RETURN(self);
+    range = static_cast<uint64_t>(to) - static_cast<uint64_t>(from);
+    random_from_to_kernel<RNG>()(iter, range, from, generator);
+  } else if (from != std::numeric_limits<int64_t>::lowest()) {
+    // [from, std::numeric_limits<int64_t>::max()]
+    int64_t to_inc = 0;
+    if (isFloatingType(iter.dtype())) {
+      AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "random_from_to_range_calc", [&] {
+        constexpr int64_t scalar_t_max = static_cast<int64_t>(1) << std::numeric_limits<scalar_t>::digits;
+        to_inc = scalar_t_max > std::numeric_limits<int64_t>::max() ? std::numeric_limits<int64_t>::max() : static_cast<int64_t>(scalar_t_max);
+        from = update_from<scalar_t>(from);
+        TORCH_CHECK(from < to_inc, "random_ expects 'from' casted to dtype to be less than or equal to 'to_inc' casted to dtype, but got from=", from, " > to_inc=", to_inc);
+      });
+    } else if (isIntegralType(iter.dtype(), /*includeBool=*/true)) {
+      AT_DISPATCH_V2(self.scalar_type(), "random_from_to_range_calc", AT_WRAP([&] {
+        if constexpr (std::is_same_v<scalar_t, bool>) {
+          to_inc = static_cast<int64_t>(true);
+        } else {
+          to_inc = static_cast<int64_t>(std::numeric_limits<scalar_t>::max());
+        }
+      }), AT_EXPAND(AT_INTEGRAL_TYPES_V2), kBool);
+    } else {
+      TORCH_CHECK(false, "random_from_to_impl handles only integral, floating-point and boolean types");
+    }
+    check_from_to_in_range(from, to_inc, self.dtype());
+    CHECK_EMPTY_AND_RETURN(self);
+    range = static_cast<uint64_t>(to_inc) - static_cast<uint64_t>(from) + 1;
+    random_from_to_kernel<RNG>()(iter, range, from, generator);
+  } else {
+    // [std::numeric_limits<int64_t>::lowest(), std::numeric_limits<int64_t>::max()]
+    // range = 2^64
+    CHECK_EMPTY_AND_RETURN(self);
+    random_from_to_kernel<RNG>()(iter, generator);
+  }
+  return self;
+}
+
+// ==================================================== Normal ========================================================
+
+#define CHECK_NORMAL_TENSOR_STD(std) \
+  do { \
+    TORCH_CHECK( \
+      !std.is_complex(), \
+      "normal expects standard deviation to be non-complex"); \
+    TORCH_CHECK( \
+      std.numel() == 0 || std.is_meta() || std.min().ge(0).item<bool>(), \
+      "normal expects all elements of std >= 0.0"); \
+  } while (0)
+
+#define CHECK_NORMAL_STD(std) \
+  TORCH_CHECK(std >= 0.0, "normal expects std >= 0.0, but found std ", std);
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor& normal_impl_(Tensor& self, double mean, double std, std::optional<Generator> gen) {
+  CHECK_NORMAL_STD(std);
+  CHECK_EMPTY_AND_RETURN(self);
+
+  if (self.is_complex()) {
+    auto float_tensor = at::view_as_real(self);
+    // variance for normal distribution of the real and imaginary values
+    // is half of the input variance
+    normal_kernel<RNG>()(float_tensor, mean, std/(std::sqrt(2)), gen);
+  } else {
+    normal_kernel<RNG>()(self, mean, std, gen);
+  }
+  return self;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor& normal_out_impl(Tensor& output, const Tensor& mean, double std, std::optional<Generator> gen) {
+  CHECK_NORMAL_STD(std);
+  auto std_tensor = at::empty_like(output, MemoryFormat::Contiguous);
+  auto shape = at::infer_size(mean.sizes(), std_tensor.sizes());
+  at::native::resize_output(output, shape);
+  normal_impl_<normal_kernel, RNG>(output, 0, std, gen);
+  output.add_(mean);
+  return output;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor& normal_out_impl(Tensor& output, double mean, const Tensor& std, std::optional<Generator> gen) {
+  CHECK_NORMAL_TENSOR_STD(std);
+  auto mean_tensor = at::full({}, mean, output.options());
+  auto shape = at::infer_size(mean_tensor.sizes(), std.sizes());
+  at::native::resize_output(output, shape);
+  normal_impl_<normal_kernel, RNG>(output, 0, 1, gen);
+  // CUDA NB: addcmul_out copies the tensor to be added into the output.
+  // The previous function here was addcmul_out(output, mean_tensor, output, std, 1);
+  // The third argument is not a constant reference and hence the samples in output are overwritten.
+  // Consequently, the computation performed is mean_tensor + mean_tensor * std instead of mean_tensor + output * std
+  output.mul_(std).add_(mean_tensor);
+  return output;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor& normal_out_impl(Tensor& output, const Tensor& mean, const Tensor& std, std::optional<Generator> gen) {
+  CHECK_NORMAL_TENSOR_STD(std);
+  auto shape = at::infer_size(mean.sizes(), std.sizes());
+  at::native::resize_output(output, shape);
+  normal_impl_<normal_kernel, RNG>(output, 0, 1, gen);
+  // CUDA NB: addcmul_out copies the tensor to be added into the output.
+  // The previous function here was addcmul_out(output, mean, output, std, 1);
+  // The third argument is not a constant reference and hence the samples in output are overwritten.
+  // Consequently, the computation performed is mean + mean * std instead of mean + output * std
+  output.mul_(std).add_(mean);
+  return output;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor normal_impl(const Tensor& mean, double std, std::optional<Generator> gen) {
+  CHECK_NORMAL_STD(std);
+  Tensor ret = at::empty_like(mean, MemoryFormat::Contiguous);
+  normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
+  return ret;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor normal_impl(double mean, const Tensor& std, std::optional<Generator> gen) {
+  CHECK_NORMAL_TENSOR_STD(std);
+  Tensor ret = at::empty_like(std, MemoryFormat::Contiguous);
+  normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
+  return ret;
+}
+
+template<template<typename> class normal_kernel, typename RNG>
+Tensor normal_impl(const Tensor& mean, const Tensor& std, std::optional<Generator> gen) {
+  CHECK_NORMAL_TENSOR_STD(std);
+  auto shape = at::infer_size(mean.sizes(), std.sizes());
+  Tensor ret = at::empty(shape, mean.options(), MemoryFormat::Contiguous);
+  normal_out_impl<normal_kernel, RNG>(ret, mean, std, gen);
+  return ret;
+}
+
+// ==================================================== Uniform =======================================================
+
+template<template<typename> class uniform_kernel, typename RNG>
+at::Tensor& uniform_impl_(at::Tensor& self, double from, double to, std::optional<Generator> generator) {
+  if (self.is_complex()) {
+    CHECK_EMPTY_AND_RETURN(self);
+    auto float_tensor = at::view_as_real(self);
+    uniform_impl_<uniform_kernel, RNG>(float_tensor, from, to, generator);
+  } else {
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "check_uniform_bounds", [&] {
+      [[maybe_unused]] const auto dtype = self.dtype();
+      const auto min = static_cast<double>(std::numeric_limits<scalar_t>::lowest());
+      const auto max = static_cast<double>(std::numeric_limits<scalar_t>::max());
+      CHECK_OUT_OF_BOUNDS(from, "from", min, max, dtype);
+      CHECK_OUT_OF_BOUNDS(to, "to", min, max, dtype);
+      TORCH_CHECK(from <= to, "uniform_ expects to return a [from, to) range, but found from=", from, " > to=", to);
+      TORCH_CHECK((to - from) <= std::numeric_limits<scalar_t>::max(),
+            "uniform_ expects to-from <= std::numeric_limits<", toString(self.scalar_type()),
+            ">::max(), but found to=", to, " and from=", from,
+            " which result in to-from to exceed the limit");
+      from = std::min(std::max(from, min), max);
+      to = std::max(std::min(to, max), min);
+    });
+    CHECK_EMPTY_AND_RETURN(self);
+    auto iter = at::TensorIterator::borrowing_nullary_op(self);
+    uniform_kernel<RNG>()(iter, from, to, generator);
+  }
+  return self;
+}
+
+// ================================================== LogNormal =======================================================
+
+template<template<typename> class log_normal_kernel, typename RNG>
+at::Tensor& log_normal_impl_(at::Tensor& self, double mean, double std, std::optional<Generator> gen) {
+  TORCH_CHECK(std > 0.0, "log_normal_ expects std > 0.0, but found std=", std);
+  CHECK_EMPTY_AND_RETURN(self);
+  auto iter = TensorIterator::borrowing_nullary_op(self);
+  log_normal_kernel<RNG>()(iter, mean, std, gen);
+  return self;
+}
+
+// =================================================== Geometric ======================================================
+
+template<template<typename> class geometric_kernel, typename RNG>
+Tensor& geometric_impl_(Tensor& self, double p, std::optional<Generator> gen) {
+  TORCH_CHECK(0 < p && p < 1, "geometric_ expects p to be in (0, 1), but got p=", p);
+  CHECK_EMPTY_AND_RETURN(self);
+  auto iter = TensorIterator::borrowing_nullary_op(self);
+  geometric_kernel<RNG>()(iter, p, gen);
+  return self;
+}
+
+// ================================================== Exponential =====================================================
+
+template<template<typename> class exponential_kernel, typename RNG>
+Tensor& exponential_impl_(Tensor& self, double lambda, std::optional<Generator> gen) {
+  TORCH_CHECK(lambda > 0.0, "exponential_ expects lambda > 0.0, but found lambda=", lambda);
+  CHECK_EMPTY_AND_RETURN(self);
+  auto iter = TensorIterator::borrowing_nullary_op(self);
+  exponential_kernel<RNG>()(iter, lambda, gen);
+  return self;
+}
+
+// ==================================================== Cauchy ========================================================
+
+template<template<typename> class cauchy_kernel, typename RNG>
+Tensor& cauchy_impl_(Tensor& self, double median, double sigma, std::optional<Generator> gen) {
+  // TODO: instead of variable name 'sigma', use 'gamma' or 'scale'
+  // the variance, squared sigma, is undefined for cauchy distribution
+  TORCH_CHECK(sigma > 0.0, "cauchy_ expects sigma > 0.0, but found sigma=", sigma);
+  TORCH_CHECK(at::isFloatingType(self.scalar_type()), "Cauchy distribution is a continuous probability distribution. dtype must be a floating point but you specified ", self.dtype());
+  CHECK_EMPTY_AND_RETURN(self);
+  auto iter = TensorIterator::borrowing_nullary_op(self);
+  cauchy_kernel<RNG>()(iter, median, sigma, gen);
+  return self;
+}
+
+// ==================================================== Bernoulli =====================================================
+
+template<template<typename> class bernoulli_tensor_kernel, typename RNG>
+Tensor& bernoulli_impl_(Tensor& self, const Tensor& p_, std::optional<Generator> gen) {
+  CHECK_EMPTY_AND_RETURN(self);
+  NoNamesGuard guard;
+  at::assert_no_internal_overlap(self);
+  bernoulli_tensor_kernel<RNG>()(self, p_, gen);
+  return self;
+}
+
+template<template<typename> class bernoulli_scalar_kernel, typename RNG>
+Tensor& bernoulli_impl_(Tensor& self, double p, std::optional<Generator> gen) {
+  TORCH_CHECK(0 <= p && p <= 1, "bernoulli_ expects p to be in [0, 1], but got p=", p);
+  CHECK_EMPTY_AND_RETURN(self);
+  at::assert_no_internal_overlap(self);
+  bernoulli_scalar_kernel<RNG>()(self, p, gen);
+  return self;
+}
+
+template<template<typename> class bernoulli_tensor_kernel, typename RNG>
+Tensor& bernoulli_out_impl(Tensor& result, const Tensor& self, std::optional<Generator> gen) {
+  // result.resize_as_(self) requires self to have same dtype as result, so we
+  // use resize_ instead.
+  // TODO: Fix resize_as_. See pytorch/pytorch#11665.
+  result.resize_(self.sizes());
+  bernoulli_impl_<bernoulli_tensor_kernel, RNG>(result, self, gen);
+  namedinference::propagate_names(result, self);
+  return result;
+}
+
+#undef CHECK_OUT_OF_BOUNDS
+#undef WARN_OUT_OF_BOUNDS
+
+} // namespace at::native::templates
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Distributions.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Distributions.h
new file mode 100644
index 0000000000000000000000000000000000000000..664e2db3b2dc5b2370929729649da6f4f4086c97
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Distributions.h
@@ -0,0 +1,518 @@
+#pragma once
+
+#include <ATen/native/Math.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/MathConstants.h>
+
+// ROCM hcc doesn't work well with using std:: in kernel functions
+#if defined(__CUDA_ARCH__)
+#include <c10/cuda/CUDAMathCompat.h>
+#define compat_exp c10::cuda::compat::exp
+#define compat_ceil c10::cuda::compat::ceil
+#define compat_floor c10::cuda::compat::floor
+#define compat_log c10::cuda::compat::log
+#define compat_pow c10::cuda::compat::pow
+#define compat_sqrt c10::cuda::compat::sqrt
+#define compat_tan c10::cuda::compat::tan
+#define compat_abs c10::cuda::compat::abs
+#define compat_log1p c10::cuda::compat::log1p
+#elif defined(__HIPCC__)
+#include <c10/hip/HIPMathCompat.h>
+#define compat_exp c10::hip::compat::exp
+#define compat_ceil c10::hip::compat::ceil
+#define compat_floor c10::hip::compat::floor
+#define compat_log c10::hip::compat::log
+#define compat_pow c10::hip::compat::pow
+#define compat_sqrt c10::hip::compat::sqrt
+#define compat_tan c10::hip::compat::tan
+#define compat_abs c10::hip::compat::abs
+#define compat_log1p c10::hip::compat::log1p
+#else
+#define compat_exp std::exp
+#define compat_ceil std::ceil
+#define compat_floor std::floor
+#define compat_log std::log
+#define compat_pow std::pow
+#define compat_sqrt std::sqrt
+#define compat_tan std::tan
+#define compat_abs std::abs
+#define compat_log1p std::log1p
+#endif
+
+namespace {
+
+#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
+// we cannot use std::isnan directly due to some incompatibility of
+// gcc constexpr'ing and nvcc
+using std::isnan;
+#endif
+
+// Here sampler_t should be function type scalar_t(void). For gpu
+// "sampler" is a device function, but since ROCM doesn't have
+// equivalent to nvstd::function, we use a template type parameter to
+// capture it.
+template<typename scalar_t, typename sampler_t>
+struct BaseSampler {
+  sampler_t sampler;
+  C10_DEVICE BaseSampler(const sampler_t& sampler): sampler(sampler) {}
+  C10_DEVICE scalar_t sample() {
+    return sampler();
+  }
+};
+
+// The function `sample_gamma` is
+// is adapted from Numpy's distributions.c implementation.
+// It is MIT licensed, so here is the copyright:
+
+/* Copyright 2005 Robert Kern (robert.kern@gmail.com)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+template<typename scalar_t, typename accscalar_t, typename uniform_sampler_t, typename normal_sampler_t>
+C10_DEVICE scalar_t sample_gamma(scalar_t alpha, BaseSampler<accscalar_t, uniform_sampler_t>& standard_uniform, BaseSampler<accscalar_t, normal_sampler_t>& standard_normal) {
+  accscalar_t scale = 1.0f;
+
+  // Boost alpha for higher acceptance probability.
+  if (alpha < 1.0f) {
+    if (alpha == 0.f) return 0.f;
+    scale *= compat_pow(1 - standard_uniform.sample(), 1.0f / alpha);
+    alpha += 1.0f;
+  }
+
+  // This implements the acceptance-rejection method of Marsaglia and Tsang (2000)
+  // doi:10.1145/358407.358414
+  const accscalar_t d = alpha - 1.0f / 3.0f;
+  const accscalar_t c = 1.0f / compat_sqrt(9.0f * d);
+  for (;;) {
+    accscalar_t x, y;
+    do {
+      x = standard_normal.sample();
+      y = 1.0f + c * x;
+    } while (y <= 0);
+    const accscalar_t v = y * y * y;
+    const accscalar_t u = 1 - standard_uniform.sample();
+    const accscalar_t xx = x * x;
+    if (u < 1.0f - 0.0331f * xx * xx)
+      return static_cast<scalar_t>(scale * d * v);
+    if (compat_log(u) < 0.5f * xx + d * (1.0f - v + compat_log(v)))
+      return static_cast<scalar_t>(scale * d * v);
+  }
+}
+
+/* the functions stirling_approx_tail, binomial_inversion, and btrs are adapted
+ * from TensorFlow's random_binomial_op.cc implementation. That code is under
+ * copyright: 2019 The TensorFlow Authors.
+ *
+ * It was released under the Apache License, Version 2.0 (the "License"), available at:
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ */
+
+template<typename scalar_t>
+C10_DEVICE scalar_t stirling_approx_tail(scalar_t k) {
+  const static scalar_t kTailValues[] = {
+    0.0810614667953272,
+    0.0413406959554092,
+    0.0276779256849983,
+    0.02079067210376509,
+    0.0166446911898211,
+    0.0138761288230707,
+    0.0118967099458917,
+    0.0104112652619720,
+    0.00925546218271273,
+    0.00833056343336287
+  };
+  if (k <= 9) {
+    return kTailValues[static_cast<size_t>(k)];
+  }
+  scalar_t kp1sq = (k + 1) * (k + 1);
+  return (1.0 / 12 - (1.0 / 360 - 1.0 / 1260 / kp1sq) / kp1sq) / (k + 1);
+}
+
+
+template<typename scalar_t, typename accscalar_t, typename uniform_sampler_t>
+C10_DEVICE scalar_t binomial_inversion(scalar_t count, scalar_t prob, BaseSampler<accscalar_t, uniform_sampler_t>& standard_uniform) {
+  accscalar_t U;
+  accscalar_t geom_sum = 0;
+  scalar_t num_geom = 0;
+
+  accscalar_t logprob = compat_log1p(-prob);
+
+  while (1) {
+    U = standard_uniform.sample();
+    accscalar_t geom = compat_ceil(compat_log(U) / logprob);
+    geom_sum += geom;
+    if (geom_sum > count) {
+      break;
+    }
+    num_geom = num_geom + 1;
+  }
+  return num_geom;
+}
+
+template<typename scalar_t, typename accscalar_t, typename uniform_sampler_t>
+C10_DEVICE scalar_t btrs(scalar_t count, scalar_t prob, BaseSampler<accscalar_t, uniform_sampler_t>& standard_uniform) {
+  scalar_t k;
+  accscalar_t U, V, us;
+
+  // This is spq in the paper.
+  const accscalar_t stddev = compat_sqrt(count * prob * (1 - prob));
+
+  // Other coefficients for Transformed Rejection sampling.
+  const accscalar_t b = 1.15 + 2.53 * stddev;
+  const accscalar_t a = -0.0873 + 0.0248 * b + 0.01 * prob;
+  const accscalar_t c = count * prob + 0.5;
+  const accscalar_t v_r = 0.92 - 4.2 / b;
+  const accscalar_t r = prob / (1 - prob);
+
+  const accscalar_t alpha = (2.83 + 5.1 / b) * stddev;
+  const accscalar_t m = compat_floor((count + 1) * prob);
+
+  while (1) {
+    U = standard_uniform.sample() - 0.5;
+    V = standard_uniform.sample();
+
+    us = 0.5 - compat_abs(U);
+    k = static_cast<scalar_t>(compat_floor((2 * a / us + b) * U + c));
+
+    // Reject non-sensical answers.
+    if (k < 0 || k > count) {
+      continue;
+    }
+    // Region for which the box is tight, and we can return our calculated value.
+    // This should happen 0.86 * v_r times. In the limit as n * p is large,
+    // the acceptance rate converges to ~79% (and in the lower regime it is ~24%).
+    if (us >= 0.07 && V <= v_r) {
+      return k;
+    }
+
+    // This deviates from Hormann's BTRS algorithm, as there is a log missing.
+    // For all (u, v) pairs outside of the bounding box, this calculates the
+    // transformed-reject ratio.
+    V = compat_log(V * alpha / (a / (us * us) + b));
+    accscalar_t upperbound =
+        ((m + 0.5) * compat_log((m + 1) / (r * (count - m + 1))) +
+         (count + 1) * compat_log((count - m + 1) / (count - k + 1)) +
+         (k + 0.5) * compat_log(r * (count - k + 1) / (k + 1)) +
+         stirling_approx_tail<accscalar_t>(m) + stirling_approx_tail<accscalar_t>(count - m) -
+         stirling_approx_tail<accscalar_t>(k) - stirling_approx_tail<accscalar_t>(count - k));
+
+    if (V <= upperbound) {
+      return k;
+    }
+  }
+}
+
+template<typename scalar_t, typename accscalar_t, typename uniform_sampler_t>
+C10_DEVICE scalar_t sample_binomial(scalar_t count, scalar_t prob, BaseSampler<accscalar_t, uniform_sampler_t>& standard_uniform) {
+  if (count <= 0.0 || prob <= 0.0) {
+    return 0;
+  } else if (prob >= 1.0) {
+    return count;
+  } else if (prob <= 0.5) {
+    if (count * prob >= 10.0) {
+      // btrs
+      return btrs<scalar_t, accscalar_t, uniform_sampler_t>(count, prob, standard_uniform);
+    } else {
+      // binomial inversion
+      return binomial_inversion<scalar_t, accscalar_t, uniform_sampler_t>(count, prob, standard_uniform);
+    }
+  } else if (prob > 0.5) {
+    scalar_t qprob = 1.0 - prob;
+    if (count * qprob >= 10.0) {
+      // btrs
+      return count - btrs<scalar_t, accscalar_t, uniform_sampler_t>(count, qprob, standard_uniform);
+    } else {
+      // count - binomial inversion
+      return count - binomial_inversion<scalar_t, accscalar_t, uniform_sampler_t>(count, qprob, standard_uniform);
+    }
+  } else {
+    // prob is nan?
+    return static_cast<scalar_t>(NAN);
+  }
+}
+
+/*
+ * This function is derived from the implementation of the digamma function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library] in ATen/native/Math.h.
+ */
+template<typename scalar_t, typename accscalar_t>
+C10_DEVICE inline scalar_t digamma_one(scalar_t x) {
+  constexpr accscalar_t PSI_10 = 2.25175258906672110764;
+  if (x == 0) {
+    return INFINITY;
+  }
+  accscalar_t additional_summand = 0;
+  int x_is_integer = x == compat_floor(x);
+  if (x < 0) {
+    if (x_is_integer) {
+      return INFINITY;
+    }
+    // it is more standard to write this as recursion, but
+    // nvcc does not like that
+    additional_summand = -c10::pi<scalar_t> /
+        compat_tan(c10::pi<scalar_t> * x);
+    x = 1 - x;
+  }
+
+  // Push x to be >= 10
+  accscalar_t result = 0;
+  while (x < 10) {
+    result -= 1 / x;
+    x += 1;
+  }
+  if (x == 10) {
+    return result + PSI_10 + additional_summand;
+  }
+
+  // Compute asymptotic digamma
+  static const accscalar_t A[] = {
+     8.33333333333333333333E-2,
+    -2.10927960927960927961E-2,
+     7.57575757575757575758E-3,
+    -4.16666666666666666667E-3,
+     3.96825396825396825397E-3,
+    -8.33333333333333333333E-3,
+     8.33333333333333333333E-2,
+  };
+
+  accscalar_t y = 0;
+  if (x < 1.0e17f) {
+    accscalar_t z = 1.0 / (x * x);
+    y = z * polevl<accscalar_t>(z, A, 6);
+  }
+  return static_cast<scalar_t>(
+      result + compat_log(x) - (0.5f / x) - y + additional_summand);
+}
+
+// Computes the reparameterized gradient -(d/dalpha cdf(x;alpha)) / pdf(x;alpha)
+// for random number x drawn from a standard Gamma distribution Gamma(alpha).
+template <typename scalar_t, typename accscalar_t>
+C10_HOST_DEVICE scalar_t standard_gamma_grad_one(scalar_t alpha_, scalar_t x_) {
+  // Use a Taylor series expansion for small x.
+  accscalar_t x = static_cast<accscalar_t>(x_);
+  accscalar_t alpha = static_cast<accscalar_t>(alpha_);
+  if (x < 0.8f) {
+    accscalar_t numer = 1;
+    accscalar_t denom = alpha;
+    auto series1 = numer / denom;
+    auto series2 = numer / (denom * denom);
+    for (int i = 1; i <= 5; ++i) {
+      numer *= -x / static_cast<accscalar_t>(i);
+      denom += 1;
+      series1 += numer / denom;
+      series2 += numer / (denom * denom);
+    }
+    const auto pow_x_alpha = compat_pow(x, alpha);
+    const auto gamma_pdf = compat_pow(x, alpha - 1) * compat_exp(-x);
+    const auto gamma_cdf = pow_x_alpha * series1;
+    const auto gamma_cdf_alpha =
+        (compat_log(x) - digamma_one<accscalar_t, accscalar_t>(alpha)) *
+            gamma_cdf -
+        pow_x_alpha * series2;
+    const auto result = -gamma_cdf_alpha / gamma_pdf;
+    return isnan(result) ? static_cast<scalar_t>( 0.f ) : static_cast<scalar_t>(result);
+  }
+
+  // Use a Rice saddle point expansion for large alpha.
+  if (alpha > 8.0f) {
+    if (0.9f * alpha <= x && x <= 1.1f * alpha) {
+      const auto numer_1 = 1 + 24 * alpha * (1 + 12 * alpha);
+      const auto numer_2 = 1440 * (alpha * alpha) + 6 * x * (53 - 120 * x)
+          - 65 * x * x / alpha + alpha * (107 + 3600 * x);
+      const auto denom = 1244160 * (alpha * alpha) * (alpha * alpha);
+      return static_cast<scalar_t>(numer_1 * numer_2 / denom);
+    }
+    const auto denom = compat_sqrt(8 * alpha);
+    const auto term2 = denom / (alpha - x);
+    const auto term3 = compat_pow(
+        x - alpha - alpha * compat_log(x / alpha),
+        static_cast<accscalar_t>(-1.5));
+    const auto term23 = (x < alpha) ? term2 - term3 : term2 + term3;
+    const auto term1 = compat_log(x / alpha) * term23 -
+        compat_sqrt(2 / alpha) * (alpha + x) / ((alpha - x) * (alpha - x));
+    const auto stirling = 1 + 1 / (12 * alpha) * (1 + 1 / (24 * alpha));
+    const auto numer = x * term1;
+    return static_cast<scalar_t>(-stirling * numer / denom);
+  }
+
+  // Use a bivariate rational approximation to the reparameterized gradient.
+  const auto u = compat_log(x / alpha);
+  const auto v = compat_log(alpha);
+  static const accscalar_t coef_uv[3][8] = {
+    {0.16009398, -0.094634809, 0.025146376, -0.0030648343,
+     1, 0.32668115, 0.10406089, 0.0014179084},
+    {0.53487893, 0.1298071, 0.065735949, -0.0015649758,
+     0.16639465, 0.020070113, -0.0035938915, -0.00058392623},
+    {0.040121004, -0.0065914022, -0.0026286047, -0.0013441777,
+     0.017050642, -0.0021309326, 0.00085092367, -1.5247877e-07},
+  };
+  accscalar_t coef_v[8];
+  for (int i = 0; i < 8; ++ i) {
+    coef_v[i] = coef_uv[0][i] + u * (coef_uv[1][i] + u * coef_uv[2][i]);
+  }
+  const auto p = coef_v[0] + v * (coef_v[1] + v * (coef_v[2] + v * coef_v[3]));
+  const auto q = coef_v[4] + v * (coef_v[5] + v * (coef_v[6] + v * coef_v[7]));
+  return static_cast<scalar_t>(compat_exp(p / q));
+}
+
+// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha.
+// Assumes x is close to zero and uses a Taylor expansion.
+template <typename scalar_t, typename accscalar_t>
+C10_DEVICE inline scalar_t _beta_grad_alpha_small(scalar_t x, scalar_t alpha, scalar_t beta) {
+  const scalar_t factor = digamma_one<scalar_t, accscalar_t>(alpha)
+                        - digamma_one<scalar_t, accscalar_t>(alpha + beta) - compat_log(x);
+  scalar_t numer = 1;
+  scalar_t series = numer / alpha * (factor + 1 / alpha);
+  for (int i = 1; i <= 10; ++i) {
+    scalar_t casted_i = static_cast<scalar_t>(i);
+    numer *= (casted_i - beta) * x / casted_i;
+    const scalar_t denom = alpha + casted_i;
+    series += numer / denom * (factor + 1 / denom);
+  }
+  const scalar_t result = x * compat_pow(1 - x, -beta) * series;
+  return isnan(result) ? static_cast<scalar_t>( 0.f ) : result;
+}
+
+// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt beta.
+// Assumes x is close to zero and uses a Taylor expansion.
+template <typename scalar_t, typename accscalar_t>
+C10_DEVICE inline scalar_t _beta_grad_beta_small(scalar_t x, scalar_t alpha, scalar_t beta) {
+  const scalar_t factor = digamma_one<scalar_t, accscalar_t>(alpha + beta) - digamma_one<scalar_t, accscalar_t>(beta);
+  scalar_t numer = 1, betas = 1, dbetas = 0, series = factor / alpha;
+  for (int i = 1; i <= 8; ++i) {
+    scalar_t casted_i = static_cast<scalar_t>(i);
+    numer *= -x / casted_i;
+    dbetas = dbetas * (beta - casted_i) + betas;
+    betas = betas * (beta - casted_i);
+    series += numer / (alpha + casted_i) * (dbetas + factor * betas);
+  }
+  const scalar_t result = -compat_pow(1 - x, 1 - beta) * series;
+  return isnan(result) ? static_cast<scalar_t>( 0.f ) : result;
+}
+
+// Approximate reparameterized gradient of Beta(x,alpha,beta) wrt alpha.
+// Assumes alpha and beta are both large and uses a Rice saddle point expansion.
+// To ensure numerical stability, this computation is performed at higher precision.
+template<typename scalar_t, typename accscalar_t>
+C10_DEVICE inline scalar_t _beta_grad_alpha_mid(accscalar_t x, accscalar_t alpha, accscalar_t beta) {
+  const accscalar_t total = alpha + beta;
+  const accscalar_t mean = alpha / total;
+  const accscalar_t std = compat_sqrt(alpha * beta / (total + 1)) / total;
+  if (mean - 0.1 * std <= x && x <= mean + 0.1 * std) {
+    // Avoid the singularity at x = mean.
+    const accscalar_t poly = 47 * x * (beta * beta) * (beta * beta) + alpha * (
+                           (43 + 20 * (16 + 27 * beta) * x) * (beta * beta) * beta + alpha * (
+                           3 * (59 + 180 * beta - 90 * x) * (beta * beta) + alpha * (
+                           (453 + 1620 * beta * (1 - x) - 455 * x) * beta + alpha * (
+                           8 * (1 - x) * (135 * beta - 11)))));
+    const accscalar_t prefactor_num = (1 + 12 * alpha) * (1 + 12 * beta) / (total * total);
+    const accscalar_t prefactor_den = 12960 * alpha * alpha * alpha * beta * beta * (1 + 12 * total);
+    return prefactor_num / (1 - x) * poly / prefactor_den;
+  }
+  const accscalar_t prefactor = -x / compat_sqrt(2 * alpha * beta / total);
+  const accscalar_t stirling = (1 + 1 / (12 * alpha) + 1 / (288 * alpha * alpha))
+                             * (1 + 1 / (12 * beta) + 1 / (288 * beta * beta))
+                             / (1 + 1 / (12 * total) + 1 / (288 * total * total));
+  const accscalar_t term1_num = 2 * (alpha * alpha) * (x - 1) + alpha * beta * (x - 1) - x * (beta * beta);
+  const accscalar_t axbx = alpha * (x - 1) + beta * x;
+  const accscalar_t term1_den = compat_sqrt(2 * alpha / beta) * compat_pow(total, static_cast<accscalar_t>(1.5f)) * axbx * axbx;
+  const accscalar_t term1 = term1_num / term1_den;
+  const accscalar_t term2 = 0.5f * compat_log(alpha / (total * x));
+  const accscalar_t term3_num = compat_sqrt(8 * alpha * beta / total);
+  const accscalar_t term3_den = beta * x + alpha * (x - 1);
+  const accscalar_t term3 = term3_num / term3_den;
+  const accscalar_t term4_base = beta * compat_log(beta / (total * (1 - x))) +
+                               alpha * compat_log(alpha / (total * x));
+  const accscalar_t term4 = compat_pow(term4_base, static_cast<accscalar_t>(-1.5f));
+  const accscalar_t term1234 = term1 + term2 * (term3 + (x < mean ? term4 : -term4));
+  return static_cast<scalar_t>(stirling * prefactor * term1234);
+}
+
+// Computes a scaled reparameterized gradient
+//   -(d/dalpha cdf(x;alpha,beta)) / pdf(x;alpha,beta) / (1-x)
+// for random number x drawn from a Beta distribution Beta(alpha,beta).
+// This function inputs total=alpha+beta to make it easy to implement
+// Dirichlet reparameterized gradients in terms of Betas.
+template<typename scalar_t, typename accscalar_t>
+C10_HOST_DEVICE inline scalar_t dirichlet_grad_one(scalar_t x, scalar_t alpha, scalar_t total) {
+  accscalar_t x_ = static_cast<accscalar_t>(x);
+  accscalar_t alpha_ = static_cast<accscalar_t>(alpha);
+  accscalar_t total_ = static_cast<accscalar_t>(total);
+
+  const scalar_t beta = total - alpha;
+  const accscalar_t beta_ = total_ - alpha_;
+  const scalar_t boundary = total * x * (1 - x);
+
+  // Use an asymptotic approximation for x close to 0.
+  if (x <= 0.5f && boundary < 2.5f) {
+    return _beta_grad_alpha_small<scalar_t, accscalar_t>(x, alpha, beta);
+  }
+
+  // Use an asymptotic approximation for x close to 1.
+  if (x >= 0.5f && boundary < 0.75f) {
+    return -_beta_grad_beta_small<scalar_t, accscalar_t>(1 - x, beta, alpha);
+  }
+
+  // Use an asymptotic approximation when alpha and (total - alpha) are both large.
+  if (alpha > 6 && beta > 6) {
+    return _beta_grad_alpha_mid<scalar_t, accscalar_t>(x_, alpha_, beta_);
+  }
+
+  // Use a rational correction to an analytic approximation.
+  static const accscalar_t c[2][3][3][4] = {
+    {{{1.003668233, -0.01061107488, -0.0657888334, 0.01201642863},
+      {0.6336835991, -0.3557432599, 0.05486251648, -0.001465281033},
+      {-0.03276231906, 0.004474107445, 0.002429354597, -0.0001557569013}},
+     {{0.221950385, -0.3187676331, 0.01799915743, 0.01074823814},
+      {-0.2951249643, 0.06219954479, 0.01535556598, 0.001550077057},
+      {0.02155310298, 0.004170831599, 0.001292462449, 6.976601077e-05}},
+     {{-0.05980841433, 0.008441916499, 0.01085618172, 0.002319392565},
+      {0.02911413504, 0.01400243777, -0.002721828457, 0.000751041181},
+      {0.005900514878, -0.001936558688, -9.495446725e-06, 5.385558597e-05}}},
+    {{{1, -0.02924021934, -0.04438342661, 0.007285809825},
+      {0.6357567472, -0.3473456711, 0.05454656494, -0.002407477521},
+      {-0.03301322327, 0.004845219414, 0.00231480583, -0.0002307248149}},
+     {{0.5925320577, -0.1757678135, 0.01505928619, 0.000564515273},
+      {0.1014815858, -0.06589186703, 0.01272886114, -0.0007316646956},
+      {-0.007258481865, 0.001096195486, 0.0003934994223, -4.12701925e-05}},
+     {{0.06469649321, -0.0236701437, 0.002902096474, -5.896963079e-05},
+      {0.001925008108, -0.002869809258, 0.0008000589141, -6.063713228e-05},
+      {-0.0003477407336, 6.959756487e-05, 1.097287507e-05, -1.650964693e-06}}},
+  };
+  const accscalar_t u = compat_log(x_);
+  const accscalar_t a = compat_log(alpha_) - u;
+  const accscalar_t b = compat_log(total_) - a;
+  const accscalar_t pow_u[3] = {1, u, u * u};
+  const accscalar_t pow_a[3] = {1, a, a * a};
+  accscalar_t p = 0.0;
+  accscalar_t q = 0.0;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      const accscalar_t ua = pow_u[i] * pow_a[j];
+      p += ua * (c[0][i][j][0] + b * (c[0][i][j][1] + b * (c[0][i][j][2] + b * c[0][i][j][3])));
+      q += ua * (c[1][i][j][0] + b * (c[1][i][j][1] + b * (c[1][i][j][2] + b * c[1][i][j][3])));
+    }
+  }
+  const accscalar_t approx = x_ * (digamma_one<scalar_t, accscalar_t>(total_) - digamma_one<scalar_t, accscalar_t>(alpha_)) / beta_;
+  return static_cast<scalar_t>(p / q * approx);
+}
+
+} // namespace
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb29e1171dcd60a5d9fbc5cb35b488603c1b0886
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h
@@ -0,0 +1,153 @@
+#include <ATen/core/Tensor.h>
+#include <ATen/Config.h>
+#include <cstdint>
+
+#ifdef USE_FBGEMM
+#include <fbgemm/FbgemmEmbedding.h>
+#endif
+
+namespace at::native {
+
+enum class EmbeddingBagMode {
+  SUM = 0,
+  MEAN = 1,
+  MAX = 2,
+};
+
+[[maybe_unused]] static bool operator==(int64_t op1, EmbeddingBagMode op2) {
+  return op1 == static_cast<int64_t>(op2);
+}
+
+[[maybe_unused]] static bool operator!=(int64_t op1, EmbeddingBagMode op2) {
+  return !(op1 == op2);
+}
+
+void check_arguments(
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const int64_t mode,
+    const std::optional<Tensor>& per_sample_weights,
+    bool include_last_offset);
+
+void make_bag_size_out(
+    Tensor& bag_size_out,
+    const Tensor& offsets,
+    const Tensor& indices,
+    const int64_t mode,
+    const bool include_last_offset,
+    const bool requires_grad);
+
+void make_max_indices_out(
+    Tensor& max_indices_out,
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const Tensor& bag_size,
+    const int64_t mode,
+    bool include_last_offset);
+
+void make_offset2bag_out(
+    Tensor& offset2bag,
+    Tensor& output,
+    const Tensor& weight,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const int64_t mode,
+    const std::optional<Tensor>& per_sample_weights,
+    const int64_t padding_idx = -1);
+
+#ifdef USE_FBGEMM
+
+template<bool has_weight, typename TIndex, typename TData>
+struct _CallbackAndBlockSize {
+    using TCallback = typename fbgemm::EmbeddingSpMDMKernelSignature<TData, TIndex, TIndex, TData>::Type;
+
+    int64_t blockSize = -1;
+    TCallback callback = nullptr;
+
+    static TCallback generateCallback(int64_t block_size) {
+        return fbgemm::GenerateEmbeddingSpMDM<TData, TIndex, TIndex, TData>(
+                block_size,
+                has_weight,
+                /* normalize_by_lengths */false,
+                /* prefetch */16,
+                /* is_weight_positional */false,
+                /* use_offsets */true);
+    }
+
+    _CallbackAndBlockSize() = default;
+
+    explicit _CallbackAndBlockSize(std::optional<int64_t> maybe_block_size)
+      : blockSize(maybe_block_size.value_or(-1))
+      , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr)
+    {}
+};
+
+template<typename... StorageMixins>
+struct _EmbeddingBagKernelCacheImpl : private StorageMixins... {
+
+    _EmbeddingBagKernelCacheImpl() = default;
+    // use each of the mixins to store corresponding kernel and block size
+    explicit _EmbeddingBagKernelCacheImpl(std::optional<int64_t> maybe_block_size)
+      : StorageMixins(maybe_block_size)...
+    {}
+
+    // this method is thread safe (call sites may call from different threads)
+    template<bool has_weight, typename TIndex, typename TData>
+    typename _CallbackAndBlockSize<has_weight, TIndex, TData>::TCallback
+    getCallback(int64_t block_size) const {
+        // if the cache doesn't store the kernel for the incoming block size
+        // (so it is different from the one stored in corresponding mixin)
+        // regenerate the kernel (not writing it into the cache so we avoid locks)
+        if (block_size != _CallbackAndBlockSize<has_weight, TIndex, TData>::blockSize) {
+            return _CallbackAndBlockSize<has_weight, TIndex, TData>::generateCallback(block_size);
+        }
+        // else retrieve the cached kernel from the corresponding mixin
+        return _CallbackAndBlockSize<has_weight, TIndex, TData>::callback;
+    }
+};
+
+// instantiate the cache with the list of storage mixins
+// for each of the 8 _EmbeddingBagKernelCache* usages in the EmbeddingBag.cpp impl file
+using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl<
+    _CallbackAndBlockSize<true, int32_t, float>,
+    _CallbackAndBlockSize<false, int32_t, float>,
+    _CallbackAndBlockSize<true, int64_t, float>,
+    _CallbackAndBlockSize<false, int64_t, float>,
+    _CallbackAndBlockSize<true, int32_t, unsigned short>,
+    _CallbackAndBlockSize<false, int32_t, unsigned short>,
+    _CallbackAndBlockSize<true, int64_t, unsigned short>,
+    _CallbackAndBlockSize<false, int64_t, unsigned short>>;
+#else
+struct _EmbeddingBagKernelCache {
+    explicit _EmbeddingBagKernelCache(std::optional<int64_t> /* maybe_block_size */) {}
+};
+#endif
+
+void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag,
+    Tensor& bag_size, Tensor* max_indices,
+    const Tensor &weight, const Tensor &indices,
+    const Tensor &offsets, const int64_t mode = 0,
+    const std::optional<Tensor>& per_sample_weights = std::nullopt,
+    bool include_last_offset = false,
+    int64_t padding_idx = -1,
+    _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr);
+
+void _embedding_bag_cpu_out(
+    at::Tensor& output,
+    at::Tensor& offset2bag,
+    at::Tensor& bag_size,
+    at::Tensor* p_max_indices,
+    const at::Tensor& weight,
+    const at::Tensor& indices,
+    const at::Tensor& offsets,
+    const bool scale_grad_by_freq,
+    const int64_t mode,
+    const bool sparse,
+    const std::optional<at::Tensor>& per_sample_weights,
+    const bool include_last_offset,
+    const std::optional<int64_t>& padding_idx,
+    _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ForeachUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ForeachUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8fbe13b8da0b9016e88c937ccbd2781eee0e160
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ForeachUtils.h
@@ -0,0 +1,396 @@
+#pragma once
+
+#include <ATen/Device.h>
+#include <ATen/Dispatch.h>
+#include <ATen/ScalarType.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/native/utils/ParamsHash.h>
+#include <c10/util/Exception.h>
+#include <c10/util/irange.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/result_type_native.h>
+#endif
+
+#include <unordered_map>
+#include <vector>
+
+namespace at::native {
+namespace {
+// Check if tensor list has either a boolean tensor or a integer tensor
+inline bool has_integral_tensor(TensorList tensors, const bool includeBool) {
+  return std::any_of(
+      tensors.begin(), tensors.end(), [&includeBool](const auto& t) {
+        return at::isIntegralType(t.scalar_type(), includeBool);
+      });
+}
+// check if tensor list has bool tensors
+inline bool has_bool_tensor(TensorList tensors) {
+  return std::any_of(tensors.begin(), tensors.end(), [](const auto& t) -> bool {
+    return t.scalar_type() == ScalarType::Bool;
+  });
+}
+
+// Check foreach API restrictions
+// - Tensor lists must be non-empty.
+// - All TensorLists and ScalarLists must have the same number of elements.
+// - Corresponding tensors must have the same size.
+inline void check_foreach_api_restrictions(TensorList tensors) {
+  TORCH_CHECK(!tensors.empty(), "Tensor list must have at least one tensor.");
+}
+
+inline void check_foreach_api_restrictions(
+    TensorList tensors,
+    ArrayRef<Scalar> scalars) {
+  check_foreach_api_restrictions(tensors);
+  TORCH_CHECK(
+      tensors.size() == scalars.size(),
+      "Tensor list must have same number of elements as scalar list.");
+}
+
+inline void check_foreach_api_restrictions(
+    TensorList tensors1,
+    TensorList tensors2) {
+  TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(
+      tensors1.size() == tensors2.size(),
+      "Tensor lists must have the same number of tensors, got ",
+      tensors1.size(),
+      " and ",
+      tensors2.size());
+}
+
+inline void check_foreach_api_restrictions(
+    TensorList tensors1,
+    TensorList tensors2,
+    TensorList tensors3) {
+  TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(!tensors3.empty(), "Tensor list must have at least one tensor.");
+  TORCH_CHECK(
+      tensors1.size() == tensors2.size(),
+      "Tensor lists must have the same number of tensors, got ",
+      tensors1.size(),
+      " and ",
+      tensors2.size());
+  TORCH_CHECK(
+      tensors1.size() == tensors3.size(),
+      "Tensor lists must have the same number of tensors, got ",
+      tensors1.size(),
+      " and ",
+      tensors3.size());
+}
+
+inline void check_foreach_api_restrictions(
+    TensorList tensors1,
+    TensorList tensors2,
+    TensorList tensors3,
+    ArrayRef<Scalar> scalars) {
+  check_foreach_api_restrictions(tensors1, tensors2, tensors3);
+  TORCH_CHECK(
+      tensors1.size() == scalars.size(),
+      "Tensor list must have same number of elements as scalar list, got ",
+      tensors1.size(),
+      " and ",
+      scalars.size());
+}
+
+// Helper function called in check_fast_path_restrictions to check whether all
+// corresponding tensors (aligning in index across the tensorLists) share the
+// same device and dtype.
+inline bool _check_tensors_share_device_and_dtype(
+    ArrayRef<TensorList> tensorLists,
+    const bool skip_dtype_check = false) {
+  const auto expected_dtype = tensorLists[0][0].dtype();
+  const auto expected_device = tensorLists[0][0].device();
+
+  auto is_tensor_okay = [&](const Tensor& tensor) {
+    return (skip_dtype_check || tensor.dtype() == expected_dtype) &&
+        tensor.device() == expected_device && tensor.layout() == at::kStrided &&
+        tensor.is_non_overlapping_and_dense();
+  };
+
+  for (const auto& tensorList : tensorLists) {
+    for (const auto& tensor : tensorList) {
+      if (!is_tensor_okay(tensor)) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Helper function called in check_fast_path_restrictions to check if
+// corresponding tensors in tensor lists have the same sizes and strides.
+inline bool _check_tensors_share_sizes_and_strides(
+    ArrayRef<TensorList> tensorLists) {
+  auto is_diff_stride = [](const IntArrayRef& size,
+                           const IntArrayRef& left_stride,
+                           const IntArrayRef& right_stride) -> bool {
+    const size_t size_size = size.size();
+    for (const auto dim : c10::irange(size_size)) {
+      if (size[dim] == 1)
+        continue;
+      if (left_stride[dim] != right_stride[dim]) {
+        return true;
+      }
+    }
+    return false;
+  };
+  for (const auto i : c10::irange(1, tensorLists.size())) {
+    for (const auto j : c10::irange(tensorLists[0].size())) {
+      if (tensorLists[0][j].sizes() != tensorLists[i][j].sizes() ||
+          is_diff_stride(
+              tensorLists[0][j].sizes(),
+              tensorLists[0][j].strides(),
+              tensorLists[i][j].strides())) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Helper function called in check_fast_path_restrictions to check whether
+// all tensors type promote properly with the scalars in scalarList. This
+// function assumes that _check_tensors_share_device_and_dtype has already been
+// called so that all corresponding tensors in tensorLists have the same dtype.
+// Then, it is sufficient to check the type promotion with just one tensorList.
+inline bool _check_tensors_do_type_promotion_with_scalars(
+    TensorList tensorList,
+    ArrayRef<Scalar> scalarList = {},
+    bool does_op_promote_integer_inputs_to_float = false) {
+  for (const auto i : c10::irange(tensorList.size())) {
+    // For division, integer inputs will result in float.
+    if (does_op_promote_integer_inputs_to_float) {
+      if (at::isIntegralType(
+              tensorList[i].scalar_type(), /*includeBool*/ true)) {
+        return false;
+      }
+    }
+    if (!scalarList.empty()) {
+      const auto& scalar =
+          scalarList.size() == 1 ? scalarList[0] : scalarList[i];
+      const auto& tensor = tensorList[i];
+      // note(mkozuki): This check might be responsible for
+      // `_foreach_add(bool_tensors, bool_tensors)` being pushed to slow path.
+      if (tensor.scalar_type() != at::native::result_type(scalar, tensor)) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// To go via 'fast' path, several conditions must be satisfied
+// - All tensors in all lists must have the same dtype.
+// - All tensors must be on the same device
+// - All tensors must have strided layout
+// - All tensors must be non-overlapping and dense
+// - Resulting tensor must have the same dtype as the input one
+
+// [note: what's ``does_op_promote_integer_inputs_to_float=true``?]
+//     ``does_op_promote_integer_inputs_to_float=true`` means that the result of
+//     the op will be float even if inputs are integer or boolean, which
+//     currently fast path does not support. In short, this flag, when
+//     turned on, gatekeeps the op from going down the fastpath.
+
+// Please, make sure to call check_foreach_api_restrictions before calling this
+// method. There is a set of preconditions that have to be satisfied.
+inline bool check_fast_path_restrictions(
+    ArrayRef<TensorList> tensorLists,
+    ArrayRef<Scalar> scalarList = {},
+    bool does_op_promote_integer_inputs_to_float = false) {
+  return _check_tensors_share_device_and_dtype(tensorLists) &&
+      _check_tensors_share_sizes_and_strides(tensorLists) &&
+      _check_tensors_do_type_promotion_with_scalars(
+             tensorLists[0],
+             scalarList,
+             does_op_promote_integer_inputs_to_float);
+}
+
+inline std::vector<c10::Scalar> convert_tensor_to_scalar_list(
+    const Tensor& scalarList_,
+    int64_t expect_length) {
+  std::vector<c10::Scalar> scalarList;
+  TORCH_CHECK(
+      scalarList_.device() == c10::kCPU,
+      "Expected scalars to be on CPU, got ",
+      scalarList_.device(),
+      " instead.");
+  TORCH_CHECK(
+      scalarList_.is_contiguous(), "Expected scalars to be contiguous.");
+  TORCH_CHECK(
+      scalarList_.dim() == 1,
+      "Expected packed scalar Tensor to be of dimension 1. Got ",
+      scalarList_.dim(),
+      " instead.");
+  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
+      kComplexHalf,
+      kHalf,
+      kBool,
+      kBFloat16,
+      scalarList_.scalar_type(),
+      "convert_tensor_to_scalar_list",
+      [&]() {
+        const scalar_t* scalar_data = scalarList_.const_data_ptr<scalar_t>();
+        TORCH_CHECK(
+            (expect_length == scalarList_.size(0)),
+            "Expected length of scalars to match input of length ",
+            expect_length,
+            " but got ",
+            scalarList_.size(0),
+            " instead.");
+        for (int64_t i = 0; i < scalarList_.size(0); i++) {
+          scalarList.emplace_back(scalar_data[i]);
+        }
+      });
+  return scalarList;
+}
+
+// see: [note: what's ``does_op_promote_integer_inputs_to_float=true``?]
+inline bool can_use_fast_route(
+    ArrayRef<TensorList> tensorLists,
+    ArrayRef<Scalar> scalarList = {},
+    bool does_op_promote_integer_inputs_to_float = false) {
+  return check_fast_path_restrictions(
+      tensorLists, scalarList, does_op_promote_integer_inputs_to_float);
+}
+
+// see: [note: what's ``does_op_promote_integer_inputs_to_float=true``?]
+inline bool can_use_fast_route(
+    TensorList tensors1,
+    TensorList tensors2,
+    bool does_op_promote_integer_inputs_to_float = false) {
+  return can_use_fast_route(
+      {tensors1, tensors2}, {}, does_op_promote_integer_inputs_to_float);
+}
+
+using DeviceDtypeKey = std::pair<at::Device, at::ScalarType>;
+using IndicesT = std::vector<size_t>;
+using nested_optional_tensorvec_t =
+    std::vector<std::vector<std::optional<at::Tensor>>>;
+using TensorsAndIndicesT = std::pair<nested_optional_tensorvec_t, IndicesT>;
+using FlatMap = std::unordered_map<
+    DeviceDtypeKey,
+    TensorsAndIndicesT,
+    ParamsHash<DeviceDtypeKey>>;
+
+inline FlatMap _group_tensors_by_first_tensors_device_and_dtype(
+    const nested_optional_tensorvec_t& nested_tensorlist,
+    const bool with_indices) {
+  FlatMap grouped_tensors_with_indices;
+
+  TORCH_CHECK(!nested_tensorlist.empty());
+  TORCH_CHECK(!nested_tensorlist[0].empty());
+  const auto num_lists = nested_tensorlist.size();
+  const auto num_tensors = nested_tensorlist[0].size();
+
+  TORCH_CHECK(std::all_of(
+      nested_tensorlist.cbegin(),
+      nested_tensorlist.cend(),
+      [&](const auto& tensorlist) -> bool {
+        // note(crcrpar): Allow empty tensorlists following
+        // ref:
+        // https://github.com/pytorch/pytorch/blob/85885301fd3c6adb8b9dc3cf7afadf6945566684/torch/utils/_foreach_utils.py#L21-L24
+        return tensorlist.size() == num_tensors || tensorlist.size() == 0;
+      }));
+
+  for (const auto& tensor_index : c10::irange(num_tensors)) {
+    const auto key = [&]() -> DeviceDtypeKey {
+      const auto t = nested_tensorlist[0][tensor_index];
+      TORCH_CHECK(
+          t.has_value(),
+          "Tensors of the first list of nested Tensor lists are supposed to be defined but ",
+          "the ",
+          tensor_index,
+          "-th Tensor is not.");
+      return {t->device(), t->scalar_type()};
+    }();
+    TORCH_CHECK(
+        std::all_of(
+            nested_tensorlist.cbegin(),
+            nested_tensorlist.cend(),
+            [&](const auto& tensorlist) -> bool {
+              if (tensorlist.size() == 0) {
+                return true;
+              }
+              const auto& tensor = tensorlist[tensor_index];
+              // note(crcrpar): Currently the scope of this function is
+              // optimizers so there could be `state_steps` and other scalars
+              // whose elements are float tensors no matter what the parameter's
+              // dtype is.
+              if (!tensor.has_value()) {
+                return true;
+              } else {
+                const auto s = tensor->scalar_type();
+                const auto d = tensor->device();
+                // Note: `step` or `state_step` is float32 by default.
+                if (key.first == d) {
+                  return key.second == s || s == at::ScalarType::Float ||
+                      s == at::ScalarType::Double;
+                } else if (d.is_cpu()) {
+                  // note(crcrpar): There are some test cases (e.g.
+                  // TestOptim::test_adam) where state_steps are on CPU and the
+                  // others are on CUDA. Currently a state_step Tensor has the
+                  // dtype of float.
+                  return s == at::ScalarType::Float ||
+                      s == at::ScalarType::Double;
+                } else {
+                  return false;
+                }
+              }
+            }),
+        "Tensors of the same index must be on the same device and the same dtype except `step` tensors that can be CPU and float32/64 notwithstanding");
+    if (!grouped_tensors_with_indices.count(key)) {
+      grouped_tensors_with_indices.insert(
+          {key,
+           TensorsAndIndicesT{
+               [&]() -> nested_optional_tensorvec_t {
+                 nested_optional_tensorvec_t nested_tensorvec;
+                 nested_tensorvec.reserve(num_lists);
+                 for (const auto& i : c10::irange(num_lists)) {
+                   std::vector<std::optional<at::Tensor>> tensors;
+                   if (!nested_tensorlist[i].empty()) {
+                     // NB: num_tensors is the max possible length for any of
+                     // the inner lists of tensor references. Reserving the max
+                     // trades memory for perf. This should not have significant
+                     // impact.
+                     tensors.reserve(num_tensors);
+                   }
+                   nested_tensorvec.emplace_back(tensors);
+                 }
+                 return nested_tensorvec;
+               }(),
+               [&]() -> IndicesT {
+                 if (!with_indices) {
+                   return {};
+                 } else {
+                   IndicesT indices;
+                   indices.reserve(num_tensors);
+                   return indices;
+                 }
+               }()}});
+    }
+    for (const auto& list_index : c10::irange(num_lists)) {
+      if (!nested_tensorlist[list_index].empty()) {
+        grouped_tensors_with_indices[key].first[list_index].emplace_back(
+            nested_tensorlist[list_index][tensor_index]);
+      }
+    }
+    if (with_indices) {
+      grouped_tensors_with_indices[key].second.emplace_back(tensor_index);
+    }
+  }
+
+  return grouped_tensors_with_indices;
+}
+
+} // namespace
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdagrad.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdagrad.h
new file mode 100644
index 0000000000000000000000000000000000000000..f1e415ba9a3dd826626223af39264c82ab4899ee
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdagrad.h
@@ -0,0 +1,20 @@
+#include <ATen/core/Tensor.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at::native {
+
+using fused_adagrad_fn = void (*)(
+    const at::Tensor& param,
+    const at::Tensor& grad,
+    const at::Tensor& state_sum,
+    const at::Tensor& state_step,
+    const double lr,
+    const double lr_decay,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const float* grad_scale_ptr);
+
+DECLARE_DISPATCH(fused_adagrad_fn, fused_adagrad_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdam.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdam.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e26cc2d84c6dab2bde4fd73f4a601c30faa0edd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedAdam.h
@@ -0,0 +1,27 @@
+#include <ATen/core/Tensor.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at::native {
+
+enum class ADAM_MODE : uint8_t { ORIGINAL = 0, ADAMW = 1 };
+
+using fused_adam_fn = void (*)(
+    const at::Tensor& param,
+    const at::Tensor& grad,
+    const at::Tensor& exp_avg,
+    const at::Tensor& exp_avg_sq,
+    const at::Tensor& max_exp_avg_sq,
+    const at::Tensor& state_step,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool amsgrad,
+    const bool maximize,
+    const float* grad_scale_ptr,
+    const ADAM_MODE);
+
+DECLARE_DISPATCH(fused_adam_fn, fused_adam_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedSGD.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedSGD.h
new file mode 100644
index 0000000000000000000000000000000000000000..c58d10b6a4a4e585cb6196f553c73aa667d5150a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FusedSGD.h
@@ -0,0 +1,21 @@
+#include <ATen/core/Tensor.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at::native {
+
+using fused_sgd_fn = void (*)(
+    const at::Tensor& param,
+    const at::Tensor& grad,
+    const at::Tensor& momentum_buffer,
+    const double weight_decay,
+    const double momentum,
+    const double lr,
+    const double dampening,
+    const bool nesterov,
+    const bool maximize,
+    const bool is_first_step,
+    const float* grad_scale_ptr);
+
+DECLARE_DISPATCH(fused_sgd_fn, fused_sgd_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSamplerUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSamplerUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f783043c7961222302ed0ff2514ffbf138cafac2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSamplerUtils.h
@@ -0,0 +1,105 @@
+#pragma once
+
+// See NOTE: [Tensor vs. TensorBase]
+// https://github.com/pytorch/pytorch/pull/66979
+#include <ATen/core/TensorBase.h>
+#include <ATen/native/TensorProperties.h>
+#include <ATen/native/CanUse32BitIndexMath.h>
+
+namespace at::native {
+
+namespace detail {
+
+enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic};
+enum class GridSamplerPadding {Zeros, Border, Reflection};
+
+} // namespace detail
+
+using detail::GridSamplerInterpolation;
+using detail::GridSamplerPadding;
+
+// See NOTE [ grid_sampler Native Functions ].
+inline void check_grid_sampler_common(
+  const TensorBase& input,
+  const TensorBase& grid
+) {
+  auto input_opt = input.options();
+  auto grid_opt = grid.options();
+
+  TORCH_CHECK(
+    input.defined(),
+    "grid_sampler(): expected input to not be undefined");
+  TORCH_CHECK(
+    grid.defined(),
+    "grid_sampler(): expected grid to not be undefined");
+  TORCH_CHECK(
+    input_opt.device() == grid_opt.device(),
+    "grid_sampler(): expected input and grid to be on same device, but input "
+    "is on ", input_opt.device(), " and grid is on ", grid_opt.device());
+  TORCH_CHECK(
+    input_opt.layout() == kStrided && grid_opt.layout() == kStrided,
+    "grid_sampler(): expected input and grid to have torch.strided layout, but "
+    "input has ", input_opt.layout(), " and grid has ", grid_opt.layout());
+  TORCH_CHECK(
+    input.size(0) == grid.size(0),
+    "grid_sampler(): expected grid and input to have same batch size, but got "
+    "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes());
+  TORCH_CHECK(
+    grid.size(-1) == input.dim() - 2,
+    "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last "
+    "dimension, but got grid with sizes ", grid.sizes());
+
+  for (const auto i : c10::irange(2, input.dim())) {
+    TORCH_CHECK(input.size(i) > 0,
+      "grid_sampler(): expected input to have non-empty spatial dimensions, "
+      "but input has sizes ", input.sizes(), " with dimension ", i, " being "
+      "empty");
+  }
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+inline void check_grid_sampler_2d(
+  const TensorBase& input,
+  const TensorBase& grid
+) {
+  TORCH_CHECK(
+    input.dim() == 4 && input.dim() == grid.dim(),
+    "grid_sampler(): expected 4D input and grid with same number of "
+    "dimensions, but got input with sizes ", input.sizes(),
+    " and grid with sizes ", grid.sizes());
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+inline void check_grid_sampler_3d(
+  const TensorBase& input,
+  const TensorBase& grid,
+  int64_t interpolation_mode
+) {
+  TORCH_CHECK(
+    input.dim() == 5 && input.dim() == grid.dim(),
+    "grid_sampler(): expected 5D input and grid with same number of "
+    "dimensions, but got input with sizes ", input.sizes(),
+    " and grid with sizes ", grid.sizes());
+  TORCH_CHECK(
+    !(input.dim() == 5 &&
+      static_cast<GridSamplerInterpolation>(interpolation_mode) ==
+        GridSamplerInterpolation::Bicubic),
+    "grid_sampler(): bicubic interpolation only supports 4D input");
+}
+
+// See NOTE [ grid_sampler Native Functions ].
+// cudnn does not support inputs larger than 1024.
+inline bool cond_cudnn_grid_sampler(
+  const TensorBase& input,
+  const TensorBase& grid
+) {
+  return (
+    at::native::cudnn_is_acceptable(input) &&
+    at::native::cudnn_is_acceptable(grid) &&
+    at::native::canUse32BitIndexMath(input) &&
+    at::native::canUse32BitIndexMath(grid) &&
+    input.dim() == 4 &&
+    input.sym_size(1) <= 1024);
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h
new file mode 100644
index 0000000000000000000000000000000000000000..fee7e06b87258138bcafb360cd48955640b25eeb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Histogram.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at::native {
+
+using histogramdd_fn = void(*)(const Tensor&, const std::optional<Tensor>&, bool, Tensor&, const TensorList&);
+using histogramdd_linear_fn = void(*)(const Tensor&, const std::optional<Tensor>&, bool, Tensor&, const TensorList&, bool);
+using histogram_select_outer_bin_edges_fn = void(*)(const Tensor& input, const int64_t N, std::vector<double> &leftmost_edges, std::vector<double> &rightmost_edges);
+
+DECLARE_DISPATCH(histogramdd_fn, histogramdd_stub);
+DECLARE_DISPATCH(histogramdd_linear_fn, histogramdd_linear_stub);
+DECLARE_DISPATCH(histogram_select_outer_bin_edges_fn, histogram_select_outer_bin_edges_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b0a787f0e87a64bced49d9c920ebd66262bd26a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexKernel.h
@@ -0,0 +1,41 @@
+#pragma once
+#include <ATen/native/DispatchStub.h>
+#include <c10/util/ArrayRef.h>
+
+namespace at {
+class Tensor;
+class TensorBase;
+struct TensorIterator;
+struct TensorIteratorBase;
+}
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at::native {
+
+using index_fn = void(*)(TensorIteratorBase &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides);
+using index_fill_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride, const Scalar& source);
+using index_copy_fn = void(*)(TensorIterator & iter, int64_t dim, int64_t self_dim_size, int64_t self_dim_stride);
+using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate);
+using put_fn = void(*)(TensorIterator & iter, const TensorBase& self, const bool accumulate);
+using take_fn = void(*)(TensorIterator & iter, const TensorBase& input);
+using flip_fn = void(*)(TensorIterator &, const bool);
+using masked_fill_fn = void(*)(TensorIterator &, const Scalar& scalar);
+using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride);
+using masked_scatter_fn = void(*)(TensorIterator &, const TensorBase &);
+
+DECLARE_DISPATCH(index_fn, index_stub);
+DECLARE_DISPATCH(index_fill_fn, index_fill_stub);
+DECLARE_DISPATCH(index_copy_fn, index_copy_stub);
+DECLARE_DISPATCH(index_put_fn, index_put_stub);
+DECLARE_DISPATCH(put_fn, put_stub);
+DECLARE_DISPATCH(take_fn, take_stub);
+DECLARE_DISPATCH(flip_fn, flip_stub);
+DECLARE_DISPATCH(masked_fill_fn, masked_fill_stub);
+DECLARE_DISPATCH(masked_select_fn, masked_select_serial_stub);
+DECLARE_DISPATCH(masked_select_fn, masked_select_stub);
+DECLARE_DISPATCH(masked_scatter_fn, masked_scatter_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..cef21c3fd80d5d82ebd16a43d58510ed793871c6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/IndexingUtils.h
@@ -0,0 +1,160 @@
+#pragma once
+#include <ATen/ExpandUtils.h>
+#include <ATen/native/CanUse32BitIndexMath.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/core/IListRef.h>
+#include <c10/util/irange.h>
+
+namespace at::native {
+
+[[noreturn]]
+static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, int64_t maskIdx) {
+  TORCH_CHECK_INDEX(false, "The shape of the mask ", mask.sizes(), " at index ", maskIdx,
+  " does not match the shape of the indexed tensor ", self.sizes(), " at index ", idx);
+}
+
+
+static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, IOptTensorListRef indices) {
+  // If indices come in as ByteTensor or BoolTensor (masks), expand them into the equivalent indexing by LongTensors
+  std::vector<Tensor> result;
+  for (const auto& index_opt : indices) {
+    if (!index_opt.has_value()) {
+      result.emplace_back();
+    } else {
+      const auto& index = *index_opt;
+      if (index.scalar_type() == kByte || index.scalar_type() == kBool) {
+        if (index.scalar_type() == kByte) {
+          TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \
+          " please use a dtype torch.bool instead.");
+        }
+        // The sizes of the ByteTensor mask or bool tensor must match the sizes of the
+        // corresponding dimensions in self
+        for (const auto j : c10::irange(index.dim())) {
+          int64_t srcIdx = static_cast<int64_t>(result.size() + j);
+          if (index.size(j) != self.size(srcIdx)) {
+            invalid_mask(self, srcIdx, index, j);
+          }
+        }
+        // Replace with nonzeros
+        auto nonzero = index.nonzero();
+        for (const auto j : c10::irange(index.dim())) {
+          result.emplace_back(nonzero.select(1, j));
+        }
+      } else {
+        result.emplace_back(index);
+      }
+    }
+  }
+  return result;
+}
+
+static C10_UNUSED void checkIndexTensorTypes(IOptTensorListRef indices, bool allow_int=false) {
+  for (const auto& tensor : indices) {
+    if (tensor.has_value() && tensor->defined()) {
+      auto scalarType = tensor->scalar_type();
+      if (allow_int) {
+        if (scalarType != kLong && scalarType != kByte && scalarType != kBool && scalarType != kInt) {
+            TORCH_CHECK_INDEX(false, "tensors used as indices must be long, int, byte or bool tensors");
+        }
+      } else {
+        if (scalarType != kLong && scalarType != kByte && scalarType != kBool) {
+            TORCH_CHECK_INDEX(false, "tensors used as indices must be long, byte or bool tensors");
+        }
+      }
+    }
+  }
+}
+
+inline torch::List<std::optional<Tensor>> toListOfOptionalTensors(ArrayRef<Tensor> list) {
+  torch::List<std::optional<Tensor>> result;
+  result.reserve(list.size());
+  for (const Tensor& a : list) {
+    result.push_back(a);
+  }
+  return result;
+}
+
+inline torch::List<std::optional<Tensor>> toListOfOptionalTensors(ArrayRef<IValue> list) {
+  torch::List<std::optional<Tensor>> result;
+  result.reserve(list.size());
+  for (const IValue& a : list) {
+    result.push_back(a.isTensor() ? std::optional<Tensor>(a.toTensor()) : std::optional<Tensor>());
+  }
+  return result;
+}
+
+static C10_UNUSED bool hasContiguousSubspace(TensorList tl) {
+  // true if all the non-null tensors are adjacent
+  auto isDefined = [](const Tensor & tensor){ return tensor.defined(); };
+  auto isNull = [](const Tensor & tensor){ return !tensor.defined(); };
+  auto start = std::find_if(tl.begin(), tl.end(), isDefined);
+  auto stop = std::find_if(tl.rbegin(), tl.rend(), isDefined);
+  auto it = std::find_if(start, stop.base(), isNull);
+  return it == stop.base();
+}
+
+
+// Transposes the tensor and indices together so that all the non-null indices
+// index the first k dimensions of the tensor. Returns the transposed tensor
+// and the reordered indices. For example:
+// transposeToFront(tensor, {nullptr, a, nullptr, b})
+// returns
+// tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr}
+static C10_UNUSED std::tuple<Tensor, std::vector<Tensor>>
+transposeToFront(const Tensor& self, TensorList indices) {
+  std::vector<int64_t> dims;
+  std::vector<Tensor> transposedIndices;
+  dims.reserve(self.dim());
+  for (const auto i : c10::irange(self.dim())) {
+    if (indices[i].defined()) {
+      dims.push_back(i);
+      transposedIndices.emplace_back(indices[i]);
+    }
+  }
+  for (const auto i : c10::irange(self.dim())) {
+    if (!indices[i].defined()) {
+      dims.push_back(i);
+      transposedIndices.emplace_back();
+    }
+  }
+  return std::make_tuple(self.permute(dims), std::move(transposedIndices));
+}
+
+inline std::tuple<Tensor, std::vector<Tensor>, std::vector<int64_t>>
+transposeToFrontAndInvPerm(const Tensor& self, TensorList indices) {
+  std::vector<int64_t> dims;
+  std::vector<int64_t> invPerm;
+  std::vector<Tensor> transposedIndices;
+  dims.reserve(self.dim());
+  invPerm.resize(self.dim());
+  for (const auto i : c10::irange(self.dim())) {
+    if (indices[i].defined()) {
+      dims.push_back(i);
+      transposedIndices.emplace_back(indices[i]);
+    }
+  }
+  for (const auto i : c10::irange(self.dim())) {
+    if (!indices[i].defined()) {
+      dims.push_back(i);
+      transposedIndices.emplace_back();
+    }
+  }
+  for (const auto i : c10::irange(self.dim())) {
+    invPerm[dims[i]] = i;
+  }
+  return std::make_tuple(self.permute(dims), std::move(transposedIndices), std::move(invPerm));
+}
+
+struct AdvancedIndex {
+  AdvancedIndex(const Tensor& src, TensorList indices);
+
+  Tensor src;
+  std::vector<Tensor> indices;
+  DimVector indexed_sizes;
+  DimVector indexed_strides;
+  int64_t dims_before;
+  int64_t dims_after;
+};
+
+
+} //namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Lerp.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Lerp.h
new file mode 100644
index 0000000000000000000000000000000000000000..6db4f60b88ea1e19a1cc744f9f8c1e7b31c66a82
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Lerp.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+#include <ATen/OpMathType.h>
+#include <ATen/TensorIterator.h>
+#include <c10/core/Scalar.h>
+
+namespace at::native {
+
+template <typename scalar_t>
+C10_HOST_DEVICE C10_ALWAYS_INLINE bool is_lerp_weight_small(scalar_t weight) {
+  return std::abs(weight) < scalar_t(0.5);
+}
+template <typename scalar_t>
+C10_HOST_DEVICE C10_ALWAYS_INLINE bool is_lerp_weight_small(c10::complex<scalar_t> weight) {
+  // Avoid the sqrt in abs(weight)
+  return (weight.real() * weight.real() + weight.imag() * weight.imag()) < scalar_t(0.25);
+}
+
+template <typename scalar_t, typename weight_t>
+C10_HOST_DEVICE C10_ALWAYS_INLINE scalar_t lerp(scalar_t self_, scalar_t end_, weight_t weight_) {
+  using opmath_t = at::opmath_type<scalar_t>;
+  using opmath_weight_t = at::opmath_type<weight_t>;
+
+  opmath_t self = self_;
+  opmath_t end = end_;
+  opmath_weight_t weight = weight_;
+
+  // Conditional for better numeric. This has been discussed in
+  // https://github.com/pytorch/pytorch/pull/18871
+  return is_lerp_weight_small(weight)
+      ? self + weight * (end - self)
+      : end - (end - self) * (opmath_t(1) - weight);
+}
+
+using lerp_fn_scalar = void (*)(
+    at::TensorIteratorBase& iter,
+    const Scalar& weight);
+
+using lerp_fn_tensor = void (*)(
+    at::TensorIteratorBase& iter);
+
+DECLARE_DISPATCH(lerp_fn_scalar, lerp_kernel_scalar_weight);
+DECLARE_DISPATCH(lerp_fn_tensor, lerp_kernel_tensor_weight);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebra.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebra.h
new file mode 100644
index 0000000000000000000000000000000000000000..c613020599c2e03aacb15f80119a42aaf9fa877b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebra.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at {
+struct TensorIterator;
+}
+
+namespace at::native {
+
+using addr_fn = void (*)(TensorIterator &, const Scalar& beta, const Scalar& alpha);
+DECLARE_DISPATCH(addr_fn, addr_stub);
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..48d9c31129654815c0542f63bf9d1d9cb9ab1dd3
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h
@@ -0,0 +1,623 @@
+#pragma once
+
+#include <c10/core/ScalarType.h>
+#include <c10/util/irange.h>
+#include <c10/util/Exception.h>
+#include <c10/util/strides.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/ExpandUtils.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/TransposeType.h>
+#include <limits>
+#include <type_traits>
+#include <sstream>
+#include <cstring>
+#include <cctype>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/arange.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#include <ATen/ops/empty_strided.h>
+#include <ATen/ops/zeros.h>
+#endif
+
+namespace at::native {
+
+inline c10::MaybeOwned<Tensor> expect_resolved_conj(const Tensor& tensor) {
+  if (tensor.is_conj()) {
+    return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
+  } else {
+    return c10::MaybeOwned<Tensor>::borrowed(tensor);
+  }
+}
+
+inline DimVector batched_matrix_contiguous_strides(
+    const IntArrayRef sizes,
+    const bool f_contig = false) {
+  // f_contig chooses between the strides of a batch of Fortran (F-contiguous)
+  // and C-contiguous matrices
+  auto strides = c10::contiguous_strides(sizes);
+  auto dim = strides.size();
+
+  if (f_contig && dim >= 2) {
+    // Fix the strides of the last two dimensions, so that we return
+    // C-contiguous batches of F-contiguous matrices.
+    strides[dim - 1] = std::max(sizes[dim - 2], static_cast<int64_t>(1));
+    strides[dim - 2] = 1;
+  }
+  return strides;
+}
+
+/*
+ * Clones a Tensor so that the following conditions hold:
+ * If we think of a Tensor of having size (B, M, N), where B is any number
+ * of batch dimensions, then:
+ * - Each (M, N) matrix is in column major form
+ * - Let Tensor P have size (B, M, N) and Q have size (B, M', N').
+ *   Then when laid out in memory, the M by N matrix starting at
+ *   P.data_ptr()[B * M * N] is of the same corresponding batch as the M' by N'
+ *   matrix starting at Q.data_ptr()[B * M' * N'].
+ */
+inline Tensor cloneBatchedColumnMajor(const Tensor& src) {
+  // If src is already in batched column major format, then
+  // this will be efficient (no reordering of the data will occur)
+  // because the first transpose will make the tensor contiguous,
+  // and cloning a contiguous tensor is fast.
+  auto result = src.mT().clone(at::MemoryFormat::Contiguous);
+  result.transpose_(-2, -1);
+  return result;
+}
+
+/*
+ * contig chooses between C-contig (true) and F-contig (false)
+ */
+inline c10::MaybeOwned<Tensor> borrow_else_clone(const bool cond, const Tensor& borrow, const Tensor& clone, const bool contig) {
+  return cond ? c10::MaybeOwned<Tensor>::borrowed(borrow)
+              : c10::MaybeOwned<Tensor>::owned(contig ? clone.clone(MemoryFormat::Contiguous)
+                                                      : cloneBatchedColumnMajor(clone));
+}
+
+/*
+ * This method is designed to be a faster alternative to
+ * `cloneBatchedColumnMajor` with some additional features,
+ * namely:
+ * 1. It uses `copy` instead of `clone` which could be much faster.
+ * 2. `nrows` parameter used to create inputs with the number of rows larger
+ *  than the original input, which is required for some LAPACK/MAGMA methods.
+ * 3. `desired_batch_size` is used to create copies with the batch size
+ *  which is either the original batch size of the input, or its larger
+ *  broadcasted shape.
+ */
+inline Tensor copyBatchedColumnMajor(const Tensor& src, int64_t nrows = -1,
+    at::OptionalIntArrayRef desired_batch_sizes = std::nullopt) {
+  nrows = (nrows == -1) ? src.size(-2) : nrows;
+  auto copy_sizes = desired_batch_sizes.has_value()
+    ? desired_batch_sizes.value().vec()
+    : IntArrayRef(src.sizes().data(), src.dim() - 2).vec();
+  copy_sizes.insert(copy_sizes.end(), {nrows, src.size(-1)});
+  const auto copy_strides = batched_matrix_contiguous_strides(copy_sizes, /*f-contig*/true);
+  auto copy = at::empty_strided(copy_sizes, copy_strides, src.options());
+  copy.narrow(-2, 0, src.size(-2)).copy_(src);
+  return copy;
+}
+
+/*
+ * Given batches of matrices with arbitrary batch dim,
+ * computes the number of batches.
+ */
+inline int64_t batchCount(const Tensor& batched_matrices) {
+  int64_t result = 1;
+  for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) {
+    result *= batched_matrices.size(i);
+  }
+  return result;
+}
+
+// Computes the number of elements of a matrix in a batched matrix tensor
+inline int64_t matrixStride(const Tensor& batched_matrices) {
+  return batched_matrices.size(-1) * batched_matrices.size(-2);
+}
+
+// Validates input shapes for operations on batches of square matrices (inverse, cholesky, symeig, eig)
+inline void checkIsMatrix(const Tensor& A, const char* const f_name, const char* const arg_name = "A") {
+  TORCH_CHECK(A.dim() >= 2, f_name, ": The input tensor ", arg_name, " must have at least 2 dimensions.");
+}
+inline void squareCheckInputs(const Tensor& self, const char* const f_name, const char* const arg_name = "A") {
+  checkIsMatrix(self, f_name, arg_name);
+  TORCH_CHECK(self.sym_size(-1) == self.sym_size(-2),
+              f_name,
+              ": ", arg_name, " must be batches of square matrices, "
+              "but they are ", self.sym_size(-2), " by ", self.sym_size(-1), " matrices");
+}
+
+inline void checkInputsSolver(const Tensor& A,
+                                     const Tensor& B,
+                                     const bool left,
+                                     const char* const f_name) {
+  squareCheckInputs(A, f_name, "A");
+  checkIsMatrix(B, f_name, "B");
+  TORCH_CHECK(left ? A.size(-2) == B.size(-2) : A.size(-1) == B.size(-1),
+              f_name, ": Incompatible shapes of A and B for the equation ",
+              left ? "AX = B" : "XA = B",
+              " (", A.size(-2), "x", A.size(-1), " and ", B.size(-2), "x", B.size(-1), ")");
+}
+
+inline bool is_row_or_column_contiguous(const Tensor& t) {
+  // This could be made more general, similar to how it's checked in matmul, which would allow to
+  // ellide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky.
+  // We choose to be conservative for simplicity
+  return t.is_contiguous() || t.transpose(-2, -1).is_contiguous();
+}
+
+inline TransposeType to_transpose_type(const bool contig, const bool conj) {
+  if (conj) {
+    if (contig) { TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); }
+    else {        return TransposeType::ConjTranspose; }
+  } else {
+    if (contig) { return TransposeType::NoTranspose; }
+    else {        return TransposeType::Transpose; }
+  }
+}
+
+
+// This function is designed to be used with linear algebra methods that minimize
+// L(ax - b) = 0, where L is generally the identity map (`solve`, for example)
+// or the L2 norm (`lstsq`).
+// It is expected that `a` and `b` are contiguous tensors of column-major matrices
+// (so that a.view({-1, a.size(-2), a.size(-1)}) succeeds, same for `b`),
+// with the following additional properties:
+//
+// 1. a.dim() == b.dim()
+// 2. a.shape[:-2] broadcasts over b.shape[:-2]
+// 3. a.size(i) <= b.size(i) for i=0,..., a.dim() - 3 (only for batch dimensions)
+//
+// MAGMA/LAPACK modify tensor `a` in-place, and the main goal of this method
+// is to be memory efficient, which means that if there exists an index i such that
+// a.shape[i] < b.shape[i], 0 <= i <= a.dim() - 3,
+// then instead of materializing copies of `a` in the broadcasted shape, we keep
+// a buffer copy of `a` along with flags that check whether specific batch dimension
+// indices for `a` were already accessed. If they were, we copy the data from the buffer
+// into `a`. The number of copies does not exceed
+// prod(max(a.shape[:-2], b.shape[:-2]) - a.shape[:-2] + 1)
+// and this value is attained by tensors with non-empty batch dimensions.
+//
+// func_t `f` is a callable that is being supplied with
+// scalar_t* a_working_ptr, scalar_t* b_working_ptr, int64_t a_linear_batch_idx.
+// a_working_ptr and b_working_ptr can directly be passed to LAPACK/MAGMA routines,
+// and a_linear_batch_idx is an index in the 3d representation which corresponds to
+// the memory a_working_ptr points to, in other words:
+// a_working_ptr == a.view({-1, a.size(-2), a.size(-1)}.select(0, a_linear_batch_idx).data_ptr<scalar_t>();
+// a_linear_batch_idx is useful to store metadata related to `a`, such as, for example,
+// its rank or singular values (see linalg_lstsq).
+template<typename scalar_t, typename func_t>
+void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const func_t& f) {
+  IntArrayRef a_batch_sizes(a.sizes().data(), a.dim() - 2);
+  IntArrayRef b_batch_sizes(b.sizes().data(), b.dim() - 2);
+
+  auto a_linear_batch_idx = at::arange(batchCount(a)).view(a_batch_sizes);
+  auto b_linear_batch_idx = at::arange(batchCount(b)).view(b_batch_sizes);
+
+  TensorIterator iter = TensorIteratorConfig()
+    .set_check_mem_overlap(false)
+    .check_all_same_dtype(false)
+    .resize_outputs(false)
+    .add_output(b_linear_batch_idx)
+    .add_input(a_linear_batch_idx)
+    .build();
+
+  auto m = a.size(-2);
+  auto n = a.size(-1);
+  auto a_3d = a.view({batchCount(a), m, n});
+  auto b_3d = b.view({batchCount(b), b.size(-2), b.size(-1)});
+
+  auto a_broadcasts_over_b = (a_batch_sizes != b_batch_sizes);
+  Tensor a_buffer, a_was_accessed, a_buffer_3d;
+  std::function<void(int64_t)> check_if_copy_needed_for_a
+    = [](int64_t /*a_curr_linear_batch_idx*/){};
+  if (a_broadcasts_over_b) {
+    a_buffer = at::empty_strided(a.sizes(), a.strides(), a.options())
+      .copy_(a);
+    a_was_accessed = at::zeros(batchCount(a), at::kBool);
+    a_buffer_3d = a_buffer.view({batchCount(a), m, n});
+    check_if_copy_needed_for_a = [&](int64_t a_curr_linear_batch_idx) {
+      auto* a_was_accessed_flag = a_was_accessed
+        .select(0, a_curr_linear_batch_idx)
+        .data_ptr<bool>();
+      if (!(*a_was_accessed_flag)) {
+        *a_was_accessed_flag = true;
+      }
+      else {
+        a_3d.select(0, a_curr_linear_batch_idx)
+          .copy_(a_buffer_3d.select(0, a_curr_linear_batch_idx));
+      }
+    };
+  }
+
+  auto loop = [&](char** data, const int64_t* strides, int64_t nelems) {
+    auto* b_batch_idx_ptr = data[0];
+    auto* a_batch_idx_ptr = data[1];
+
+    for (const auto elem C10_UNUSED : c10::irange(nelems)) {
+      auto b_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(b_batch_idx_ptr);
+      auto a_curr_linear_batch_idx = *reinterpret_cast<int64_t*>(a_batch_idx_ptr);
+
+      check_if_copy_needed_for_a(a_curr_linear_batch_idx);
+
+      auto* a_working_ptr = a_3d.select(0, a_curr_linear_batch_idx)
+        .data_ptr<scalar_t>();
+      auto* b_working_ptr = b_3d.select(0, b_curr_linear_batch_idx)
+        .data_ptr<scalar_t>();
+      f(a_working_ptr, b_working_ptr, a_curr_linear_batch_idx);
+
+      b_batch_idx_ptr += strides[0];
+      a_batch_idx_ptr += strides[1];
+    }
+  };
+  iter.serial_for_each(loop, {0, batchCount(b)});
+}
+
+// Returns the epsilon value for floating types except half
+inline double _get_epsilon(const ScalarType& sc_type) {
+  switch (sc_type) {
+    case at::ScalarType::Float:
+      return static_cast<double>(std::numeric_limits<float>::epsilon());
+    case at::ScalarType::Double:
+      return std::numeric_limits<double>::epsilon();
+    default:
+      AT_ERROR("This function doesn't handle types other than float and double");
+  }
+}
+
+// Validates input shapes and devices
+// for linear solve methods (solve, cholesky_solve, lu_solve, triangular_solve)
+inline void linearSolveCheckInputs(const Tensor& self, const Tensor& A, const char* name) {
+  TORCH_CHECK(self.device() == A.device(),
+              "Expected b and A to be on the same device, but found b on ",
+              self.device(), " and A on ", A.device(), " instead.");
+
+  TORCH_CHECK(self.scalar_type() == A.scalar_type(),
+              "Expected b and A to have the same dtype, but found b of type ",
+              self.scalar_type(), " and A of type ", A.scalar_type(), " instead.");
+
+  TORCH_CHECK(A.size(-1) == A.size(-2),
+              "A must be batches of square matrices, "
+              "but they are ", A.size(-2), " by ", A.size(-1), " matrices");
+
+  TORCH_CHECK(A.size(-1) == self.size(-2),
+              "Incompatible matrix sizes for ", name, ": each A "
+              "matrix is ", A.size(-1), " by ", A.size(-1),
+              " but each b matrix is ", self.size(-2), " by ", self.size(-1));
+}
+
+inline void checkFloatingOrComplex(const Tensor& t, const char* const f_name, const bool allow_low_precision_dtypes=true) {
+  auto dtype = t.scalar_type();
+  TORCH_CHECK((at::isFloatingType(dtype) || at::isComplexType(dtype)),
+              f_name, ": Expected a floating point or complex tensor as input. Got ", dtype);
+  if (!allow_low_precision_dtypes) {
+    TORCH_CHECK(dtype == kFloat || dtype == kDouble || dtype == kComplexFloat || dtype == kComplexDouble,
+                f_name, ": Low precision dtypes not supported. Got ", dtype);
+  }
+}
+
+
+// Checks if all the Tensors in a TensorList are of the same dimensions
+inline void checkAllSameDim(TensorList tensors, int64_t dim) {
+  for (auto &t : tensors) {
+    TORCH_CHECK(t.dim() == dim, "Tensor dimension is ", t.dim(), ", expected ", dim, " instead.");
+  }
+}
+
+inline std::tuple<std::vector<int64_t>, std::vector<int64_t>> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) {
+  // broadcast the batch dimensions of arg1 and arg2.
+  IntArrayRef arg1_batch_sizes(arg1.sizes().data(), arg1.ndimension() - 2);
+  IntArrayRef arg2_batch_sizes(arg2.sizes().data(), arg2.ndimension() - 2);
+  std::vector<int64_t> expand_batch_portion = infer_size(arg1_batch_sizes, arg2_batch_sizes);
+
+  std::vector<int64_t> arg1_expand_size({expand_batch_portion});
+  arg1_expand_size.insert(arg1_expand_size.end(), { arg1.size(-2), arg1.size(-1) });
+
+  std::vector<int64_t> arg2_expand_size({expand_batch_portion});
+  arg2_expand_size.insert(arg2_expand_size.end(), { arg2.size(-2), arg2.size(-1) });
+  return std::make_tuple(std::move(arg1_expand_size), std::move(arg2_expand_size));
+}
+
+inline std::tuple<Tensor,Tensor> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) {
+  // If there's no name we assume we don't want to check the errors
+  if (name != nullptr) {
+    linearSolveCheckInputs(arg1, arg2, name);
+  }
+
+  auto [arg1_expand_size, arg2_expand_size] = at::native::_linalg_broadcast_batch_dims(arg1, arg2);
+
+  auto arg1_broadcasted  = arg1_expand_size == arg1.sizes() ? arg1 : arg1.expand(arg1_expand_size);
+  auto arg2_broadcasted  = arg2_expand_size == arg2.sizes() ? arg2 : arg2.expand(arg2_expand_size);
+  return std::make_tuple(arg1_broadcasted, arg2_broadcasted);
+}
+
+inline std::vector<int64_t> broadcast_batch_size(const Tensor& t1, const Tensor& t2, int64_t n_batch_dims) {
+  IntArrayRef t1_batch_sizes(t1.sizes().data(), n_batch_dims);
+  IntArrayRef t2_batch_sizes(t2.sizes().data(), n_batch_dims);
+  auto broadcasted_batch_sizes = infer_size(t1_batch_sizes, t2_batch_sizes);
+  return broadcasted_batch_sizes;
+}
+
+// Return a permutation with the given axes moved to the end.
+inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) {
+  const std::vector<int64_t> a = axes.vec();
+  const int64_t ndim = self.ndimension();
+  std::vector<int64_t> perm;
+
+  for (const auto i : c10::irange(ndim)) {
+    auto it = std::find(a.begin(), a.end(), i);
+    if (it == a.end()) {
+       perm.push_back(i);
+    }
+  }
+  for (auto i : a) {
+    perm.push_back(i);
+  }
+
+  TORCH_CHECK((int64_t)perm.size() == ndim,
+    "duplicate or invalid axis in 'dim' argument for tensor with ndim==", ndim);
+
+  return self.permute(perm);
+}
+
+// parse the "mode" param in linalg_qr: return a tuple of bools (compute_q, reduced)
+inline std::tuple<bool, bool> _parse_qr_mode(c10::string_view mode) {
+  bool compute_q;
+  bool reduced;
+  if (mode == "reduced") {
+    compute_q = true;
+    reduced = true;
+  } else if (mode == "complete") {
+    compute_q = true;
+    reduced = false;
+  } else if (mode == "r") {
+    compute_q = false;
+    reduced = true; // this is actually irrelevant in this mode
+  } else {
+      TORCH_CHECK(false, "qr received unrecognized mode '", mode,
+                  "' but expected one of 'reduced' (default), 'r', or 'complete'");
+  }
+  return std::make_tuple(compute_q, reduced);
+}
+
+// Function to compute sizes, strides and the extra columns for the Q matrix in the QR Decomposition
+inline std::tuple<DimVector, DimVector, int64_t> _compute_geometry_for_Q(
+    const Tensor& input,
+    bool reduced) {
+  int64_t m = input.size(-2), n = input.size(-1);
+  int64_t n_columns_q;
+
+  // We need to compute the required size of Q based on the `reduced` option
+  DimVector q_sizes(input.sizes());
+  if (!reduced && m > n) {
+    q_sizes[input.dim() - 1] = m;
+    n_columns_q = m;
+  } else {
+    q_sizes[input.dim() - 1] = n;
+    n_columns_q = std::min(m, n);
+  }
+  auto q_strides = batched_matrix_contiguous_strides(q_sizes, /*f-contig*/true);
+  return std::make_tuple(q_sizes, q_strides, n_columns_q);
+}
+
+inline bool svd_uses_cusolver(const Tensor& A) {
+  // if cusolver is available, it is used unconditionally
+  return A.is_cuda()
+         && at::globalContext().hasCuSOLVER()
+         && at::globalContext().linalgPreferredBackend() != at::LinalgBackend::Magma;
+}
+
+
+// Function used instead of .to so that the original strides are retained
+// .to doesn't retain strides and make the output tensor contiguous
+inline Tensor same_stride_to(const Tensor& original_tensor, const at::TensorOptions& options) {
+  auto strided_to = at::empty_strided(original_tensor.sizes(),
+                                      original_tensor.strides(),
+                                      options);
+  strided_to.copy_(original_tensor);
+  return strided_to;
+}
+
+// Creates a dimension permutation array that can be given to `at::permute()`, which will shift
+// the two specified dimensions to the end of a tensor, without changing the order of
+// the other dimensions. `dim1` will be placed at the very end, and `dim0` will be
+// placed just to the left of it.
+//
+// For instance, given a 4-D tensor, dimensions 1 and 3 can be shifted to the end by
+// calling `create_dim_backshift_permutation(1, 3, 4)`. The resulting vector will
+// be `vec(0, 2, 1, 3)`.
+inline std::vector<int64_t> create_dim_backshift_permutation(int64_t dim0, int64_t dim1, int64_t ndim) {
+  TORCH_CHECK(
+    (dim0 != dim1) && (dim0 < ndim) && (dim0 >= 0) && (dim1 < ndim) && (dim1 >= 0),
+    "duplicate or invalid dimensions");
+  std::vector<int64_t> permutation(ndim);
+  int64_t cur_permuted_dim = 0;
+  for (const auto dim_ind : c10::irange(ndim)) {
+    if ((dim_ind != dim0) && (dim_ind != dim1)) {
+      permutation[cur_permuted_dim++] = dim_ind;
+    }
+  }
+  permutation[cur_permuted_dim++] = dim0;
+  permutation[cur_permuted_dim] = dim1;
+  return permutation;
+}
+
+// Creates a dimension permutation array that can be given to `at::permute()`, which
+// will reverse a given permutation.
+// The reverse permutation array is created by swapping the indices and their
+// associated values from the given permutation array.
+inline std::vector<int64_t> create_reverse_permutation(std::vector<int64_t> permutation) {
+  int64_t ndim = permutation.size();
+  std::vector<int64_t> reverse_permutation(ndim);
+  for (const auto dim_ind : c10::irange(ndim)) {
+    reverse_permutation[permutation[dim_ind]] = dim_ind;
+  }
+  return reverse_permutation;
+}
+
+// Compute R-work array size for MAGMA/LAPACK cgesdd/zgesdd
+// See https://github.com/Reference-LAPACK/lapack/blob/122506cd8b6ce050a200920c3d4c0b153b150fd8/SRC/cgesdd.f#L186
+inline int64_t computeLRWorkDim(const char jobz, int64_t m, int64_t n) {
+  auto mn = std::min(m, n);
+  auto mx = std::max(m, n);
+  if (jobz == 'N') {
+#ifdef __APPLE__
+    // According to `vecLib.framework/Headers/clapack.h` Accelerate.framework is based on LAPACK 3.2.1
+    return 7 * mn;
+#else
+    // These setting is valid for on LAPACK 3.6+
+    return 5 * mn;
+#endif
+  }
+  if (mx > 10 * mn) {
+    return 5 * mn * mn + 5 * mn;
+  }
+  return std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn);
+}
+
+// This function checks whether the uplo argument input is valid
+// Allowed strings are "u", "U", "l", "L"
+inline void checkUplo(const c10::string_view uplo) {
+  // To use std::toupper safely with plain chars (or signed chars), the argument should first be converted to unsigned char
+  char uplo_uppercase = static_cast<char>(std::toupper(static_cast<unsigned char>(uplo[0])));
+  TORCH_CHECK(uplo.size() == 1 && (uplo_uppercase == 'U' || uplo_uppercase == 'L'),
+    "Expected UPLO argument to be 'L' or 'U', but got ", uplo);
+}
+
+inline void checkSameDevice(const std::string& fn_name, Tensor result, Tensor input, const std::string& result_name = "result") {
+  TORCH_CHECK(
+      result.device() == input.device(),
+      fn_name,
+      ": Expected ", result_name, " and input tensors to be on the same device, but got ",
+      result_name, " on ", result.device(), " and input on ", input.device());
+}
+
+// Check the dtype of result and input tensors (for _out variants).
+// Most linear algebra functions have the same dtype for input and output
+// (either floating or complex type input), so we can check whether input's dtype can be casted to result's dtype.
+// According to https://github.com/pytorch/pytorch/wiki/Developer-FAQ#how-does-out-work-in-pytorch
+// c10::canCast is used for checking the "safe copy" dtype requirements.
+inline void checkLinalgCompatibleDtype(const std::string& fn_name, Tensor result, Tensor input, const std::string& result_name = "result") {
+  bool can_cast = c10::canCast(input.scalar_type(), result.scalar_type());
+  TORCH_CHECK(
+      can_cast,
+      fn_name,
+      ": Expected ", result_name, " to be safely castable from ", input.scalar_type(), " dtype, but got ",
+      result_name, " with dtype ", result.scalar_type());
+}
+
+// Alternatively, we can check whether the specific expected output type (result_type) can be safely casted to out tensor dtype (out_type)
+inline void checkLinalgCompatibleDtype(const std::string& fn_name, ScalarType out_type, ScalarType result_type, const std::string& out_name = "result") {
+  bool can_cast = c10::canCast(result_type, out_type);
+  TORCH_CHECK(
+      can_cast,
+      fn_name,
+      ": Expected ", out_name, " to be safely castable from ", result_type, " dtype, but got ",
+      out_name, " with dtype ", out_type);
+}
+
+inline void checkNotComplexTolerance(const Tensor& tol, const c10::string_view f_name, const c10::string_view tol_name) {
+  TORCH_CHECK(!at::isComplexType(tol.scalar_type()),
+              f_name, ": ", tol_name, " tensor of complex type is not supported. Got ", tol.scalar_type());
+}
+
+/*
+  Two types of 'other' tensors are supported when solving
+  a system of linear equations matmul(input, x) = other:
+  * 1-dimensional (1D) tensor or batch of 1D tensors (vector case)
+  * 2-dimensional (2D) tensor or batch of 2D tensors (matrix case).
+  The original torch.solve supported only the matrix case, while NumPy works for both cases.
+  For the batched input we need to be able to distinguish them.
+  Let input.shape = (batch_dimensions, m, n), then 'other' is of vector type if other.shape == (batch_dimensions, m).
+  This rule is compatible with NumPy, see https://github.com/numpy/numpy/blob/v1.20.0/numpy/linalg/linalg.py#L384-L389
+*/
+inline bool linalg_solve_is_vector_rhs(const Tensor& input, const Tensor& other) {
+  auto expected_batched_rhs_shape = SymIntArrayRef(input.sym_sizes().data(), input.dim() - 1); // input.shape[:-1]
+  bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sym_sizes().equals(expected_batched_rhs_shape));
+  return vector_case;
+}
+
+/*
+  Computes linear indices for a tensor with original_shape to access its elements like it was a materialized broadcast tensor.
+*/
+inline Tensor get_linear_indices(int64_t numel, IntArrayRef original_shape, IntArrayRef broadcast_shape) {
+  TensorOptions options = at::TensorOptions().dtype(at::kLong).device(at::kCPU);
+  return at::arange(numel, options).view(original_shape).broadcast_to(broadcast_shape).contiguous();
+}
+
+class BroadcastLinearIndices {
+ private:
+  Tensor linear_indices_;
+  bool is_broadcasting_;
+
+ public:
+  BroadcastLinearIndices(
+      int64_t numel,
+      IntArrayRef original_shape,
+      IntArrayRef broadcast_shape) : is_broadcasting_(!original_shape.equals(broadcast_shape)) {
+    // The assumption is that the broadcast_shape is a materialized broadcast
+    // shape of the original_shape. We need to compute the linear indices
+    // compatible with the original_shape to access the elements in the original
+    // tensor corresponding to the broadcast tensor.
+    if (is_broadcasting_) {
+      linear_indices_ =
+          get_linear_indices(numel, original_shape, broadcast_shape);
+    }
+  }
+  int64_t operator()(int64_t broadcast_linear_index) {
+    return is_broadcasting_
+        ? linear_indices_.data_ptr<int64_t>()[broadcast_linear_index]
+        : broadcast_linear_index;
+  }
+};
+
+inline bool is_blas_compatible_column_major_order(const Tensor& input) {
+  IntArrayRef input_strides = input.strides();
+  IntArrayRef input_sizes = input.sizes();
+  auto ndim = input.dim();
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2);
+  if (ndim > 3) {
+    return input.transpose(-2, -1).is_contiguous();
+  }
+  auto leading_dimension = input_strides[ndim - 1];
+  auto rows = input_sizes[ndim - 2];
+  bool batch_stride_compatible = true;
+  if (ndim == 3) {
+    auto cols = input_sizes[ndim - 1];
+    batch_stride_compatible =
+        input_strides[ndim - 3] >= leading_dimension * cols;
+  }
+  return (input_strides[ndim - 2] == 1) &&
+      (leading_dimension >= std::max<int64_t>(1, rows)) &&
+      batch_stride_compatible;
+}
+
+inline bool is_blas_compatible_row_major_order(const Tensor& input) {
+  IntArrayRef input_strides = input.strides();
+  IntArrayRef input_sizes = input.sizes();
+  auto ndim = input.dim();
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2);
+  if (ndim > 3) {
+    return input.is_contiguous();
+  }
+  auto leading_dimension = input_strides[ndim - 2];
+  auto cols = input_sizes[ndim - 1];
+  bool batch_stride_compatible = true;
+  if (ndim == 3) {
+    auto rows = input_sizes[ndim - 2];
+    batch_stride_compatible =
+        input_strides[ndim - 3] >= leading_dimension * rows;
+  }
+  return (input_strides[ndim - 1] == 1) &&
+      (leading_dimension >= std::max<int64_t>(1, cols)) &&
+      batch_stride_compatible;
+}
+
+}  // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h
new file mode 100644
index 0000000000000000000000000000000000000000..de2296634e04511497827128c4483c5e75c6a674
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitsFallback.h
@@ -0,0 +1,157 @@
+#include <ATen/core/Tensor.h>
+#include <ATen/core/dispatch/Dispatcher.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/native/UnaryOps.h>
+#include <ATen/native/Resize.h>
+#include <c10/util/irange.h>
+#include <torch/library.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/clone.h>
+
+#include <utility>
+#endif
+
+namespace at::native {
+// This fallback should only be used for operations that are self inverse and have a corresponding tensor
+// bit (internally implemented using DispatchKey) to maintain the state on tensor using tensor bit.
+// Currently there are two tensor bits that trigger this fallback: conjugate bit and negative bit.
+// Conjugate bit is set on a tensor when `.conj()` is called and neg bit is set on a tensor when `.conj().imag` is called.
+
+// NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit.
+struct MathOpFallback {
+  MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(std::move(op_name_)) {}
+  virtual bool is_bit_set(const Tensor&) = 0;
+  void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
+    /*
+      Situations to handle:
+        1. Out-of-place operation.  Easy: materialize all inputs and
+          call it a day.
+        2. Inplace operation.  Desugar x.add_(2) into x.conj_().add_(2).conj_().
+          Materialize other inputs as in (1).
+        3. out= operation.  Desugar add(x, 2, out=y) into y.copy_(add(x, 2))
+        Materialize other inputs as in (1).
+
+        It is important to be able to tell if we READ from an argument and if we
+        WRITE to an argument.  Conservative approach is to assume that we always
+        READ from an argument, but in out= operations you can skip
+        conjugating inputs on entry that never get used. In the current schema we
+        can't easily tell if the operation is in in-place or out= operation.
+
+        Note:
+        1. Mutable tensorlists containing tensors whose math bit set to true are disallowed.
+        2. Mutable tensors with math bit set to true are unconditionally cloned to ensure
+           correct behavior in the case when the mutable tensor shares memory with non mutable arguments.
+
+           If we were to in-place resolve the math bit for mutable inputs, then the non-mutable inputs sharing partial or full memory
+           with these mutable inputs would read into wrong values in the following cases:
+           1. Non mutable inputs have their math bit set to false.
+           2. Math bit for mutable input(s) is resolved before the non mutable inputs (with bit set to true and sharing memory
+              with one or more mutable arg(s)) are cloned.
+           At the end, the final value of the mutable arguments from the stack are copied into the original input mutable tensor inputs.
+    */
+    const auto& arguments = op.schema().arguments();
+    const auto num_arguments = arguments.size();
+    const auto stack_start = stack->size() - num_arguments;
+
+    std::optional<bool> is_write;
+    for (const auto i : c10::irange(num_arguments)) {
+      // Three possible states:
+      // 1. alias_info has no value --> out-of-place operation
+      // 2. alias_info does have a value, alias_info->is_write=True --> in-place or out= operation
+      // 3. alias_info does have a value, alias_info->is_write=False --> view operation
+      const AliasInfo* alias_info = arguments[i].alias_info();
+      if (alias_info != nullptr) {
+        if (is_write.has_value()) {
+          TORCH_CHECK(*is_write == alias_info->isWrite(),
+            "Unsupported operator for ", op_name, " fallback: ", op.schema().name(),
+            op_name, " fallback doesn't work for operators with a mix "
+            "mutable and non-mutable inputs that alias with outputs, "
+            "this must be implemented manually.  "
+            "If you got this error on a core op, please report a bug to PyTorch.");
+        } else {
+          is_write = alias_info->isWrite();
+        }
+      }
+    }
+
+    if (is_write.has_value() && !*is_write) {
+      // We assume that view operators automatically handle the math bit
+      // correctly by propagating the dispatch key in key_set.
+      // This is not necessarily always right, so you should test these cases.
+      op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack);
+      return;
+    }
+
+    // Mutable inputs with math bit set to True and their clones
+    std::vector<std::pair<Tensor, Tensor>> mutable_inputs_with_their_clones;
+    for (const auto i : c10::irange(num_arguments)) {
+      auto& ivalue = (*stack)[stack_start + i];
+      if (!(ivalue.isTensor() || ivalue.isTensorList())) {
+        continue;
+      }
+      const auto& argument = arguments[i];
+      bool mut_arg = false;
+      if (argument.alias_info()) {
+        // Was already tested by is_write loop above
+        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(argument.alias_info()->isWrite());
+        mut_arg = true;
+      }
+      if (ivalue.isTensor()) {
+        if (!is_bit_set(ivalue.toTensor())) {
+          continue;
+        }
+        auto tensor = std::move(ivalue).toTensor();
+        auto resolved_tensor = at::clone(tensor);
+        if (mut_arg) {
+          TORCH_CHECK(mutable_inputs_with_their_clones.empty(), op_name, " fallback does not support operators with more than one mutable tensors with ",
+            op_name, "bit set to true.");
+          mutable_inputs_with_their_clones.emplace_back(std::move(tensor), resolved_tensor);
+        }
+        (*stack)[stack_start + i] = std::move(resolved_tensor);
+      } else if (ivalue.isTensorList()) {
+        auto tensors = std::move(ivalue).toTensorList();
+        for(const auto j : c10::irange(tensors.size())) {
+          const auto& tensor = tensors[j];
+          if (!is_bit_set(tensor)) {
+            continue;
+          }
+          TORCH_CHECK(!mut_arg, " fallback doesn't currently support mutable TensorLists with ",
+              op_name, " inputs. Please materialize all the ", op_name, " input tensor(s) in the mutable TensorList inputs before calling ",
+              op.schema().name());
+          tensors[j] = at::clone(tensor);
+        }
+        (*stack)[stack_start + i] = std::move(tensors);
+      }
+    }
+
+    op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(DispatchKeySet::FULL_AFTER, key), stack);
+
+    TORCH_INTERNAL_ASSERT(mutable_inputs_with_their_clones.size() <= 1);
+
+    for (std::pair<Tensor, Tensor> mut_tensors: mutable_inputs_with_their_clones) {
+      auto& mutable_input =  mut_tensors.first;
+      auto& cloned_mutable_input =  mut_tensors.second;
+      auto& ivalue = (*stack)[stack_start];
+      auto returned_output = std::move(ivalue).toTensor();
+
+      // sanity check to ensure that the tensor in stack aliases the cloned_mutable_input
+      TORCH_INTERNAL_ASSERT(cloned_mutable_input.is_same(returned_output));
+
+      // necessary for out= arg
+      at::native::resize_output(mutable_input, returned_output.sizes());
+
+      mutable_input.copy_(returned_output);
+      (*stack)[stack_start] = std::move(mutable_input);
+    }
+  }
+
+  virtual ~MathOpFallback() = default;
+
+  DispatchKey key;
+  string op_name;
+};
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..7044b6ee3dc21abd1197850f3117e407418a13c6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MaxPooling.h
@@ -0,0 +1,97 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/Parallel.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/Pool.h>
+
+namespace at::native {
+
+inline void check_max_pool1d(
+    const Tensor& self,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    IntArrayRef dilation,
+    bool ceil_mode) {
+
+  TORCH_CHECK(
+      self.dim() == 2 || self.dim() == 3,
+      "max_pool1d() Expected 2D or 3D input tensor, but got ", self.sym_sizes());
+  TORCH_CHECK(
+      kernel_size.size() == 1,
+      "max_pool1d() kernel_size must be an int, list of ints or tuple of ints of size 1 but got size ",
+      kernel_size.size());
+  TORCH_CHECK(
+      stride.empty() || stride.size() == 1,
+      "max_pool1d() stride must be None, an int, list of ints, or tuple of ints of size 1 but got size ",
+      stride.size());
+  TORCH_CHECK(
+      padding.size() == 1,
+      "max_pool1d() padding must be an int, list of ints, or tuple of ints of size 1 but got size ",
+      padding.size());
+  TORCH_CHECK(
+      dilation.size() == 1,
+      "max_pool1d() dilation must be an int, list of ints or tuple of ints of size 1 but got size ",
+      dilation.size());
+
+  // If stride=None then set it to kernel_size
+  if (stride.empty()) {
+    stride = kernel_size;
+  }
+
+  TORCH_CHECK(
+      kernel_size[0] > 0,
+      "max_pool1d() kernel_size must be greater than zero, but got ",
+      kernel_size[0]);
+  TORCH_CHECK(
+      stride[0] > 0, "max_pool1d() stride must be greater than zero, but got ", stride[0]);
+  TORCH_CHECK(
+      padding[0] >= 0, "max_pool1d() padding must be non-negative, but got ", padding[0]);
+  TORCH_CHECK(
+      padding[0] <= kernel_size[0] / 2,
+      "max_pool1d() padding should be at most half of kernel size, but got padding=",
+      padding[0],
+      " and kernel_size=",
+      kernel_size[0]);
+  TORCH_CHECK(
+      dilation[0] > 0, "max_pool1d() dilation must be greater than zero, but got ", dilation[0]);
+
+  const int64_t OW = pooling_output_shape(self.sym_size(-1).guard_int(__FILE__, __LINE__), kernel_size[0], padding[0], stride[0], dilation[0], ceil_mode);
+  TORCH_CHECK(OW > 0, "max_pool1d() Invalid computed output size: ", OW);
+}
+
+// TODO(Heitor) Template by dimension
+struct PoolingParams1D {
+  int64_t NB; // Number of batches
+  int64_t NC; // Number of channels
+  int64_t IW; // Input width
+  int64_t OW; // Output width
+  int64_t KW; // Kernel width
+  int64_t SJ; // Column stride
+  int64_t PJ; // Column padding
+  int64_t DJ; // Column dilation
+
+  // Return index of input element for the given kernel and output index
+  inline int64_t index(int64_t kj, int64_t oj) const {
+    return oj * SJ + kj * DJ - PJ;
+  }
+
+  // Return index of first output within bounds for this kernel index
+  inline int64_t valid_output_start(int64_t kj) const {
+    int64_t ij = index(kj, 0);;
+    return ij < 0 ? at::divup(-ij, SJ) : 0;
+  }
+
+  // Return index one past last output within bounds for this kernel index
+  inline int64_t valid_output_end(int64_t kj) const {
+    int64_t ij = index(kj, OW - 1);
+    return ij >= IW ? OW - at::divup(ij - (IW - 1), SJ) : OW;
+  }
+};
+
+using pooling_fn = void (*)(Tensor&, const Tensor&, const PoolingParams1D&);
+
+DECLARE_DISPATCH(pooling_fn, max_pool1d_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonSymbolicBC.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonSymbolicBC.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbcfd3a13f9c3c71a5291fdad2e6b95b186cf681
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonSymbolicBC.h
@@ -0,0 +1,26 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <c10/util/irange.h>
+#include <ATen/core/IListRef.h>
+
+namespace at::native {
+// This file contains non-symbolic signatures for ops that we have sym-intified the signature of.
+// However, in certain cases (such as static runtime), we call the native versions of the ops directly.
+// In those cases, we will duplicate the signature here with non-symbolic ints, and also duplicate the C++ implementation.
+TORCH_API at::Tensor reshape(const at::Tensor& self, at::IntArrayRef proposed_shape);
+TORCH_API at::Tensor narrow(const at::Tensor& self, int64_t dim, int64_t start, int64_t length);
+TORCH_API at::Tensor _sparse_coo_tensor_unsafe(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, std::optional<at::ScalarType> dtype=std::nullopt, std::optional<at::Layout> layout=std::nullopt, std::optional<at::Device> device=std::nullopt, std::optional<bool> pin_memory=std::nullopt, std::optional<bool> is_coalesced=std::nullopt);
+TORCH_API at::Tensor nll_loss(const at::Tensor & self, const at::Tensor & target, const std::optional<at::Tensor>& weight_opt, int64_t reduction, int64_t ignore_index);
+TORCH_API at::Tensor nll_loss2d(const at::Tensor & self, const at::Tensor & target, const std::optional<at::Tensor>& weight_opt, int64_t reduction, int64_t ignore_index);
+// The below ops don't get a duplicated C++ implementation.
+// They are backward ops, which make them very unlikely to be called directly
+// by external code (at::native::trace_backward).
+// They get their own declaration for BC purposes however.
+TORCH_API at::Tensor _embedding_bag_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, bool sparse, const std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1);
+TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const std::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1);
+TORCH_API at::Tensor value_selecting_reduction_backward(const at::Tensor & grad, int64_t dim, const at::Tensor & indices, at::IntArrayRef sizes, bool keepdim);
+TORCH_API at::Tensor trace_backward(const at::Tensor & grad, at::IntArrayRef sizes);
+TORCH_API at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index);
+TORCH_API at::Tensor select(const at::Tensor& self, int64_t dim, int64_t index);
+TORCH_API std::vector<Tensor> tensor_split(const Tensor& self, IntArrayRef indices, int64_t dim);
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RangeFactories.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RangeFactories.h
new file mode 100644
index 0000000000000000000000000000000000000000..df3b43856e0980841cace5500a3e009e1501c8a0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/RangeFactories.h
@@ -0,0 +1,12 @@
+#include <ATen/native/DispatchStub.h>
+#include <c10/core/Scalar.h>
+
+namespace at {
+struct TensorIterator;
+
+namespace native {
+
+DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&, const Scalar&, const Scalar&), arange_stub);
+DECLARE_DISPATCH(void(*)(TensorIterator&, const Scalar&, const Scalar&, int64_t), linspace_stub);
+
+}}  // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceAllOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceAllOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3ece0328fe35f5b32faae79d8a66020f5045b1e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceAllOps.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+class Tensor;
+}
+
+namespace at::native {
+
+using reduce_all_fn = void (*)(Tensor & result, const Tensor & self);
+using reduce_min_max_fn = void (*)(Tensor & max_result, Tensor & min_result, const Tensor & self);
+DECLARE_DISPATCH(reduce_all_fn, min_all_stub);
+DECLARE_DISPATCH(reduce_all_fn, max_all_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReductionType.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReductionType.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe28f21cb750139f583af1951cbeaa77fa0a7ed0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReductionType.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <c10/core/Scalar.h>
+
+namespace at::native {
+
+enum class ReductionType {MAX, MEAN, MIN, SUM, PROD};
+
+inline ReductionType get_reduction_enum(const c10::string_view& reduce) {
+  if (reduce == "max" || reduce == "amax") {
+    return ReductionType::MAX;
+  } else if (reduce == "mean") {
+    return ReductionType::MEAN;
+  } else if (reduce == "min" || reduce == "amin") {
+    return ReductionType::MIN;
+  } else if (reduce == "sum") {
+    return ReductionType::SUM;
+  } else if (reduce == "prod") {
+    return ReductionType::PROD;
+  } else {
+    TORCH_CHECK(false, "reduce argument must be either sum, prod, mean, amax or amin, got ", reduce);
+  }
+}
+
+// used for `scatter_reduce`, old options for BC.
+inline ReductionType get_operator_enum(const c10::string_view reduce, bool use_new_options) {
+  if (use_new_options) {
+    return get_reduction_enum(reduce);
+  } else {
+    if (reduce == "add") {
+      return ReductionType::SUM;
+    } else if (reduce == "multiply") {
+      return ReductionType::PROD;
+    } else {
+      TORCH_CHECK(false, "reduce argument must be either add or multiply.")
+    }
+  }
+}
+
+} // at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8d6f92553a4c9bde2f90574a23108fd9667a26c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Repeat.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorOperators.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
+#endif
+
+namespace at::native {
+
+template <
+    typename index_t,
+    void compute(const index_t*, const int64_t*, index_t*, int64_t, int64_t)>
+static inline Tensor repeat_interleave_common(
+    const Tensor& repeats,
+    std::optional<int64_t> output_size) {
+  TORCH_CHECK(
+      repeats.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
+  TORCH_CHECK(
+      repeats.scalar_type() == at::kLong || repeats.scalar_type() == at::kInt,
+      "repeats has to be Long or Int tensor");
+  if (repeats.size(0) == 0) {
+    return at::empty_like(repeats, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  }
+  Tensor repeats_ = repeats.contiguous();
+  Tensor cumsum = repeats.cumsum(0);
+  int64_t total = 0;
+  if (output_size.has_value()) {
+    total = output_size.value();
+  } else {
+    total = cumsum[-1].item<int64_t>();
+    TORCH_CHECK(
+        (repeats >= 0).all().item<uint8_t>(), "repeats can not be negative");
+  }
+
+  Tensor result = at::empty({total}, repeats.options());
+  const index_t* repeat_ptr = repeats_.const_data_ptr<index_t>();
+  const int64_t* cumsum_ptr = cumsum.const_data_ptr<int64_t>();
+  index_t* result_ptr = result.data_ptr<index_t>();
+  compute(repeat_ptr, cumsum_ptr, result_ptr, repeats.size(0), total);
+  return result;
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h
new file mode 100644
index 0000000000000000000000000000000000000000..951a08e84c7793d33b76203d0dfb738fa91056e0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Resize.h
@@ -0,0 +1,173 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/native/ResizeCommon.h>
+#include <ATen/EmptyTensor.h>
+#include <ATen/TensorUtils.h>
+
+#include <c10/core/CPUAllocator.h>
+
+#include <utility>
+
+
+namespace at::native {
+
+// TODO: make all operations that resize given outputs use this function
+//   for consistency and maintainability.
+//   Some operations like `cat` might not be able to make the use of
+//   resize_output directly. For more details to understand how it works in `cat`,
+//   see https://github.com/pytorch/pytorch/pull/62560#discussion_r687363362
+// Resizes outputs
+// Functions accepting output tensors, like with the "out" kwarg, should
+//   call this function to handle resizing their output tensor.
+// Issues a warning if the output tensor has one or more elements and
+//   needs resizing
+// NOTE: In the future the warning will become an error
+// Returns a bool saying whether or not the resize actually happened or not
+TORCH_API bool resize_output(const Tensor& output, IntArrayRef shape);
+// WARNING: Do NOT call this directly. If you are resizing an output and want
+// to support dynamic shapes call at::resize__symint and resize_output_check_symint.
+// For more details, see: https://github.com/pytorch/pytorch/pull/111530/files#r1365845272
+TORCH_API bool resize_output_symint(const Tensor& output, SymIntArrayRef shape);
+
+// Utility for resize_output
+//  Returns a bool saying resize should happen or not and
+//  raises a warning if resizing for one or more elements
+TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape);
+TORCH_API bool resize_output_check_symint(const Tensor& output, SymIntArrayRef shape);
+
+TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes);
+TORCH_API void resize_bytes_meta(StorageImpl* storage, c10::SymInt size_bytes);
+TORCH_API void resize_bytes_nocuda(const Storage& storage, const c10::SymInt& size_bytes);
+
+inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) {
+  // It does not make sense to try to resize a storage
+  // to hold 0 elements, and this can break
+  // if storage_offset is positive but
+  // new_size is 0, so just bail in that case
+  // (same comment is in cuda/Resize.h)
+  if (self->numel() == 0) {
+    return;
+  }
+
+  const Storage& storage = self->unsafe_storage();
+  if (!storage) {
+    auto new_storage = c10::make_intrusive<StorageImpl>(
+        StorageImpl::use_byte_size_t(),
+        new_size_bytes,
+        c10::GetCPUAllocator(),
+        true);
+    self->set_storage_keep_dtype(std::move(new_storage));
+  } else if (new_size_bytes > storage.nbytes()) {
+    resize_bytes_cpu(storage.unsafeGetStorageImpl(), new_size_bytes);
+  }
+}
+
+TORCH_API TensorImpl* resize_impl_cpu_(
+    TensorImpl* self,
+    IntArrayRef size,
+    at::OptionalIntArrayRef stride,
+    bool resize_storage = true);
+
+template <typename T>
+T maybe_convert_symint(c10::SymInt) = delete;
+
+template <>
+inline c10::SymInt maybe_convert_symint(c10::SymInt x) { return x; }
+
+template <>
+inline int64_t maybe_convert_symint(c10::SymInt x) { return x.guard_int(__FILE__, __LINE__); }
+
+template <typename T>
+inline void checkInBoundsForStorage(
+    ArrayRef<T> size,
+    ArrayRef<T> stride,
+    T storage_offset,
+    const caffe2::TypeMeta& data_type,
+    const Storage& new_storage) {
+  T storage_size_bytes =
+      at::detail::computeStorageNbytes(size, stride, data_type.itemsize());
+  T storage_offset_bytes = storage_offset * data_type.itemsize();
+  if (storage_size_bytes == 0) {
+    // NB: (a tensor with arbitrary 0 dims)'s storage can have any numel.
+    return;
+  }
+  T new_storage_size_bytes = maybe_convert_symint<T>(new_storage.sym_nbytes());
+  TORCH_CHECK(
+      storage_size_bytes + storage_offset_bytes <= new_storage_size_bytes,
+      "setStorage: sizes ",
+      size,
+      ", strides ",
+      stride,
+      ","
+      " storage offset ",
+      storage_offset,
+      ", and itemsize ",
+      data_type.itemsize(),
+      " requiring a storage size of ",
+      storage_size_bytes + storage_offset_bytes,
+      " are out of bounds for storage of size ",
+      new_storage_size_bytes);
+}
+
+template <typename T>
+inline void checkSetStorage(Tensor& result, Storage storage, T storage_offset,
+                                   ArrayRef<T> size, ArrayRef<T> stride) {
+  // FIXME: stride should be optional
+  if (stride.data()) {
+    TORCH_CHECK(size.size() == stride.size(), "unequal size length (", size.size(),
+                                              ") and stride length (", stride.size(), ")");
+  }
+
+#ifdef DEBUG
+  TORCH_CHECK(size.size() <= INT_MAX, "size length (", size.size(), ") greater than INT_MAX");
+#endif
+
+  // storage: note this can't be replaced with result.set_(storage) as the semantics of that
+  // function is to set the tensor size to be equal to the size of the storage.
+  if (!result.storage().is_alias_of(storage)) {
+    // Caffe2 might have tensors whose storages are null, but we
+    // don't allow it in PyTorch.
+    TORCH_INTERNAL_ASSERT(storage);
+    TORCH_INTERNAL_ASSERT(result.storage());
+
+    // We used to allow this, but this breaks device caching.
+    // Let's put an actual error message for this one.
+    TORCH_CHECK(result.storage().device() == storage.device(),
+                "Attempted to set the storage of a tensor on device \"", result.storage().device(),
+                "\" to a storage on different device \"", storage.device(),
+                "\".  This is no longer allowed; the devices must match.");
+    result.unsafeGetTensorImpl()->set_storage_keep_dtype(std::move(storage));
+  }
+
+  // storageOffset
+  TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset);
+}
+
+/**
+ * Set self's sizes, strides, and storage_offset.
+ * (size, stride, storage_offset) must be in bounds for self's storage.
+ */
+template <typename T>
+inline void setStrided(
+    const Tensor& self,
+    ArrayRef<T> size,
+    ArrayRef<T> stride,
+    T storage_offset) {
+  TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape");
+  for (const auto& val : stride) {
+    TORCH_CHECK(val >= 0,
+                "as_strided: Negative strides are not supported at the moment, "
+                "got strides: ", stride);
+  }
+
+  auto* self_ = self.unsafeGetTensorImpl();
+  checkInBoundsForStorage(
+      size, stride, storage_offset, self_->dtype(), self_->storage());
+
+  /* storage offset */
+  TORCH_CHECK(storage_offset >= 0, "Tensor: invalid storage offset ", storage_offset);
+  self_->set_sizes_and_strides(size, stride, std::make_optional(storage_offset));
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h
new file mode 100644
index 0000000000000000000000000000000000000000..cea2612a22127d00532da761cf8e773164db5ca4
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ResizeCommon.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/native/TensorFactories.h>
+#include <ATen/NamedTensorUtils.h>
+#include <c10/util/irange.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/empty.h>
+#endif
+
+namespace at::native {
+
+template <typename T>
+inline T storage_size_for(ArrayRef<T> size, ArrayRef<T> stride) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(size.size() == stride.size(),
+      "storage_size_for(size, stride) requires that size and stride ",
+      "have the same size as a precondition.");
+  T storage_size = 1;
+  for (const auto dim : c10::irange(size.size())) {
+    if (size[dim] == 0) {
+      storage_size = 0;
+      break;
+    }
+    storage_size += (size[dim] - 1) * stride[dim];
+  }
+  return storage_size;
+}
+
+inline const Tensor& resize_named_tensor_(
+    const Tensor& self,
+    IntArrayRef size,
+    std::optional<MemoryFormat> optional_memory_format) {
+  TORCH_INTERNAL_ASSERT(self.has_names());
+  TORCH_CHECK(
+      self.sizes() == size,
+      "Cannot resize named tensor with resize_ or resize_as_ (tried to resize "
+      "Tensor",
+      self.names(),
+      " with size ",
+      self.sizes(),
+      " to ",
+      size,
+      "). This may be caused by passing a named tensor ",
+      "as an `out=` argument; please ensure that the sizes are the same. ");
+  TORCH_CHECK(
+      !optional_memory_format.has_value(),
+      "Unsupported memory format for named tensor resize ",
+      optional_memory_format.value());
+  return self;
+}
+
+// For deterministic output, fill new elements that were added after a storage
+// resize with NaN or MAX_INT. `old_storage_nbytes` is the size of the storage
+// before the resize happened.
+inline const Tensor& fill_resize_deterministic_(const Tensor& tensor, int64_t old_storage_nbytes) {
+  const at::Storage& storage = tensor.unsafeGetTensorImpl()->unsafe_storage();
+  int64_t new_storage_nbytes = storage.nbytes();
+  int64_t old_storage_numel = old_storage_nbytes / tensor.itemsize();
+  int64_t new_storage_numel = new_storage_nbytes / tensor.itemsize();
+  if (new_storage_numel > old_storage_numel) {
+    at::Tensor tensor_view = at::empty({}, at::TensorOptions().dtype(tensor.scalar_type()).device(tensor.device()));
+    tensor_view.set_(
+      storage,
+      /*storage_offset=*/old_storage_numel,
+      /*size=*/{new_storage_numel - old_storage_numel},
+      /*stride=*/{1});
+    at::native::fill_empty_deterministic_(tensor_view);
+  }
+  return tensor;
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ScatterGatherChecks.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ScatterGatherChecks.h
new file mode 100644
index 0000000000000000000000000000000000000000..94816f5cfb6c7368fc616fe23868740e8bc242f6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ScatterGatherChecks.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include <vector>
+#include <ATen/core/Tensor.h>
+#include <ATen/native/ReduceOpsUtils.h>
+#include <c10/util/irange.h>
+
+namespace at::native {
+
+namespace {
+
+// checks whether index.dtype == int64
+// and self.dtype == src.dtype if src is a Tensor
+inline void scatter_gather_dtype_check(
+  const std::string& method_name,
+  const Tensor& self,
+  const Tensor& index,
+  const std::optional<Tensor>& src_opt = std::nullopt
+) {
+  if (index.numel() != 0) {
+    TORCH_CHECK(
+      index.scalar_type() == at::ScalarType::Long,
+      method_name, "(): Expected dtype int64 for index"
+    );
+  }
+
+  if (src_opt.has_value()) {
+    const auto& src = src_opt.value();
+    TORCH_CHECK(
+      self.scalar_type() == src.scalar_type(),
+      method_name, "(): Expected self.dtype to be equal to src.dtype"
+    );
+  }
+}
+
+// Used for `gather`-like methods
+// Note: self means the input tensor here
+// Test:
+// 1. index.size(d) <= self.size(d) for all d != dim
+// 2. index.dim() == self.dim()
+inline void gather_shape_check(const Tensor& self, int64_t dim,
+  const Tensor& index
+) {
+  auto self_dims = ensure_nonempty_dim(self.dim());
+  TORCH_CHECK(self_dims == ensure_nonempty_dim(index.dim()),
+    "Index tensor must have the same number of dimensions as input tensor"
+  );
+
+  for (const auto i : c10::irange(self_dims)) {
+    if (i != dim) {
+      TORCH_CHECK(
+        ensure_nonempty_size(index, i) <= ensure_nonempty_size(self, i),
+        "Size does not match at dimension ", i,
+        " expected index ", index.sizes(),
+        " to be smaller than self ", self.sizes(),
+        " apart from dimension ", dim
+      );
+    }
+  }
+}
+
+// Used for `scatter` and `scatter_add`
+// Tests:
+//  1. index.size(d) <= self.size(d) for all d != dim
+//  2. index.size(d) <= src.size(d) for all d if src is a Tensor
+//  3. index.dim() == self.dim() == src.dim()
+inline void scatter_shape_check(
+  const Tensor& self, int64_t dim, const Tensor& index,
+  const std::optional<Tensor>& src_opt = std::nullopt
+) {
+  if (index.numel() == 0) return;
+  TORCH_CHECK(
+    ensure_nonempty_dim(self.dim()) == ensure_nonempty_dim(index.dim()),
+    "Index tensor must have the same number of dimensions as self tensor"
+  );
+
+  bool is_wrong_shape = false;
+  int64_t self_dims = ensure_nonempty_dim(self.dim());
+
+  //  Check: index.size(d) <= self.size(d) for all d != dim
+  for (const auto d : c10::irange(self_dims)) {
+    int64_t index_d_size = ensure_nonempty_size(index, d);
+    if (d == dim) continue;
+    if (index_d_size > ensure_nonempty_size(self, d)) {
+      is_wrong_shape = true;
+      break;
+    }
+  }
+
+  //  Check: index.size(d) <= src.size(d) for all d if src is Tensor
+  if (!is_wrong_shape && src_opt.has_value()) {
+    const auto& src = src_opt.value();
+    for (const auto d : c10::irange(self_dims)) {
+      int64_t index_d_size = ensure_nonempty_size(index, d);
+      if (index_d_size > ensure_nonempty_size(src, d)) {
+        is_wrong_shape = true;
+        break;
+      }
+    }
+  }
+
+  if (src_opt.has_value()) {
+    const auto& src = src_opt.value();
+
+    TORCH_CHECK(
+      ensure_nonempty_dim(src.dim()) == ensure_nonempty_dim(index.dim()),
+      "Index tensor must have the same number of dimensions as src tensor"
+    );
+
+    TORCH_CHECK(!is_wrong_shape,
+      "Expected index ", index.sizes(),
+      " to be smaller than self ", self.sizes(),
+      " apart from dimension ", dim,
+      " and to be smaller size than src ", src.sizes()
+    );
+  }
+  else {
+    TORCH_CHECK(!is_wrong_shape,
+      "Expected index ", index.sizes(),
+      " to be smaller than self ", self.sizes(),
+      " apart from dimension ", dim
+    );
+  }
+}
+
+} // anonymous namespace
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b7167ee93dd29edd028a76040a6f1937c2de35d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SharedReduceOps.h
@@ -0,0 +1,544 @@
+#pragma once
+// Please note that this file is
+// used across both CPU and GPU.
+
+#include <type_traits>
+#include <complex>
+#include <c10/macros/Macros.h>
+#include <ATen/detail/FunctionTraits.h>
+#include <ATen/NumericUtils.h>
+#if defined(__CUDACC__)
+#include <ATen/cuda/DeviceUtils.cuh>
+#include <ATen/native/cuda/DeviceSqrt.cuh>
+#elif defined(__HIPCC__)
+#include <ATen/hip/DeviceUtils.cuh>
+#include <ATen/native/hip/DeviceSqrt.cuh>
+#endif
+#if defined(__CUDACC__) || defined(__HIPCC__)
+#include <thrust/pair.h>
+#else
+#include <cmath>
+#define device_sqrt std::sqrt
+#endif
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template <typename scalar_t>
+inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) {
+#if defined(__HIPCC__)
+  // TODO: remove this special case for HIP when issue is fixed:
+  //       https://github.com/ROCm-Developer-Tools/HIP/issues/2209
+  scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b));
+#else
+  scalar_t max = at::_isnan(b) ? b : std::max(a, b);
+#endif
+  return max;
+}
+template <typename scalar_t>
+inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) {
+#if defined(__HIPCC__)
+  // TODO: remove this special case for HIP when issue is fixed:
+  //       https://github.com/ROCm-Developer-Tools/HIP/issues/2209
+  scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b));
+#else
+  scalar_t min = at::_isnan(b) ? b : std::min(a, b);
+#endif
+  return min;
+}
+#define MAX(X, Y) max_propagate_nan(X,Y)
+#define MIN(X, Y) min_propagate_nan(X,Y)
+#else
+#include <ATen/native/cpu/zmath.h>
+#define MAX(X, Y) max_impl(X,Y)
+#define MIN(X, Y) min_impl(X,Y)
+#endif
+
+// ROCM hcc doesn't work well with using std:: in kernel functions
+#if defined(__CUDA_ARCH__)
+#include <c10/cuda/CUDAMathCompat.h>
+#define compat_pow c10::cuda::compat::pow
+#elif defined(__HIPCC__)
+#include <c10/hip/HIPMathCompat.h>
+#define compat_pow c10::hip::compat::pow
+#else
+#define compat_pow std::pow
+#endif
+
+namespace at { namespace native {
+
+namespace detail {
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+template <typename T1, typename T2> using pair = thrust::pair<T1, T2>;
+#else
+template <typename T1, typename T2> using pair = std::pair<T1, T2>;
+#endif
+
+} // namespace detail
+
+template <typename scalar_t, typename index_t>
+struct WelfordData {
+  scalar_t mean;
+  scalar_t m2;
+  index_t n;
+  scalar_t nf;
+
+  C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {}
+
+  C10_HOST_DEVICE WelfordData(
+      scalar_t mean,
+      scalar_t m2,
+      index_t n,
+      scalar_t nf)
+      : mean(mean), m2(m2), n(n), nf(nf) {}
+};
+
+
+template <typename scalar_t, typename acc_scalar_t, typename index_t, typename res_t>
+struct WelfordOps {
+  acc_scalar_t correction;
+  bool take_sqrt;
+ public:
+  using acc_t = WelfordData<acc_scalar_t, index_t>;
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const {
+    // We accumulate n in index_t to avoid cumulative rounding error, but still
+    // need nf for use in combine where int32 may overflow.
+    index_t new_n = acc.n + 1;
+    acc_scalar_t new_nf = static_cast<acc_scalar_t>(new_n);
+    acc_scalar_t delta = data - acc.mean;
+    acc_scalar_t new_mean = acc.mean + delta / new_nf;
+    acc_scalar_t new_delta = data - new_mean;
+    return {
+      new_mean,
+      acc.m2 + delta * new_delta,
+      new_n,
+      new_nf,
+    };
+  }
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    if (a.nf == 0) {
+      return b;
+    }
+    if (b.nf == 0) {
+      return a;
+    }
+    acc_scalar_t delta = b.mean - a.mean;
+    acc_scalar_t new_count = a.nf + b.nf;
+    acc_scalar_t nb_over_n = b.nf / new_count;
+    return {
+      a.mean + delta * nb_over_n,
+      a.m2 + b.m2 + delta * delta * a.nf * nb_over_n,
+      // setting acc.n as -1 since acc.n might not be able to represent the count
+      // correctly within its range, setting it to -1 to avoid confusion
+      -1,
+      new_count
+    };
+  }
+  inline C10_DEVICE res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ {
+    const auto mean = static_cast<scalar_t>(acc.mean);
+    const auto divisor = acc.nf > correction ? acc.nf - correction : 0;
+    const auto var = acc.m2 / divisor;
+    res_t results(take_sqrt ? device_sqrt(var) : var, mean);
+    return results;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return {
+      WARP_SHFL_DOWN(acc.mean, offset)
+      , WARP_SHFL_DOWN(acc.m2, offset)
+      , WARP_SHFL_DOWN(acc.n, offset)
+      , WARP_SHFL_DOWN(acc.nf, offset)
+    };
+  }
+#endif
+  C10_HOST_DEVICE WelfordOps(acc_scalar_t correction, bool take_sqrt)
+      : correction(correction), take_sqrt(take_sqrt) {}
+};
+
+template <typename scalar_t, typename acc_t=scalar_t, typename factor_t=acc_t, typename out_t = acc_t>
+struct MeanOps {
+  factor_t factor;
+
+  inline C10_DEVICE acc_t reduce(acc_t a, scalar_t b, int64_t /*idx*/) const {
+    return combine(a, static_cast<acc_t>(b));
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return a + b;
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return a * factor;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const {
+    return WARP_SHFL_DOWN(data, offset);
+  }
+#endif
+
+  MeanOps(factor_t factor): factor(factor) {
+  }
+};
+
+// This accumulator template is used to calculate the minimum absolute value of
+// a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct AbsMinOps {
+
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    return MIN(acc, static_cast<acc_t>(std::abs(data)));
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return MIN(a, b);
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return a;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+};
+
+// This accumulator template is used to calculate the maximum absolute value of
+// a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct AbsMaxOps {
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    return MAX(acc, static_cast<acc_t>(std::abs(data)));
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return MAX(a, b);
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return a;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+};
+
+// This accumulator template is used to calculate the norm of the absolute value
+// of a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct NormOps {
+  acc_t norm_;
+
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    return acc + compat_pow(static_cast<acc_t>(std::abs(data)), norm_);
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return a + b;
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return compat_pow(a, static_cast<acc_t>(1.0) / norm_);
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+
+  NormOps(acc_t norm_): norm_(norm_) {
+  }
+};
+
+// This accumulator template is used to calculate the order zero norm of the
+// absolute value of a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct NormZeroOps {
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    return acc + (data == static_cast<scalar_t>(0) ? static_cast<acc_t>(0) : static_cast<acc_t>(1));
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return a + b;
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return a;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+};
+
+// This accumulator template is used to calculate the order one norm of the
+// absolute value of a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct NormOneOps {
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    return acc + static_cast<acc_t>(std::abs(data));
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return a + b;
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return a;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+};
+
+
+template<typename acc_t>
+struct AbsSwitch {};
+
+template<typename scalar_t, typename acc_t>
+inline C10_DEVICE acc_t abs_if_complex(scalar_t data, AbsSwitch<acc_t>) {
+  return static_cast<acc_t>(data);
+}
+
+template<typename scalar_t, typename acc_t>
+inline C10_DEVICE acc_t abs_if_complex(std::complex<scalar_t> data, AbsSwitch<acc_t>) {
+  return static_cast<acc_t>(std::abs(data));
+}
+
+template<typename scalar_t, typename acc_t>
+inline C10_DEVICE acc_t abs_if_complex(c10::complex<scalar_t> data, AbsSwitch<acc_t>) {
+  return static_cast<acc_t>(std::abs(data));
+}
+
+// This accumulator template is used to calculate the order two norm of the
+// absolute value of a set of numbers.
+// `scalar_t` is the type of the input and `acc_t` is the type of the accumulated
+// value. These types differ for complex number input support.
+template <typename scalar_t, typename acc_t = scalar_t, typename out_t = acc_t>
+struct NormTwoOps {
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const {
+    acc_t data_ = abs_if_complex(data, AbsSwitch<acc_t>());
+    return acc + data_ * data_;
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return a + b;
+  }
+
+  inline C10_DEVICE out_t project(acc_t a) const {
+    return device_sqrt(a);
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return WARP_SHFL_DOWN(acc, offset);
+  }
+#endif
+};
+
+template <typename acc_t, typename data_t>
+struct NanSumOps {
+  inline C10_DEVICE acc_t reduce(acc_t a, data_t b, int64_t /*idx*/) const {
+    return a + (at::_isnan(b) ? acc_t{0.} : acc_t{b});
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    return  a + b;
+  }
+
+  inline C10_DEVICE data_t project(acc_t a) const {
+    return data_t{a};
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const {
+    return WARP_SHFL_DOWN(data, offset);
+  }
+#endif
+};
+
+namespace detail {
+
+template <typename scalar_t>
+struct LessOrNan {
+  C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const {
+    // If (a == b), then choose the one with lower idx, else min(a, b)
+    if (at::_isnan(a)) {
+      if (at::_isnan(b)) {
+        return idx_a < idx_b;
+      }
+      return true;
+    }
+    return (a == b) ? idx_a < idx_b : (a < b);
+  }
+};
+
+template <typename scalar_t>
+struct GreaterOrNan {
+  C10_DEVICE bool operator () (scalar_t a, scalar_t b, int64_t idx_a, int64_t idx_b) const {
+    // If (a == b), then choose the one with lower idx, else max(a, b)
+    if (at::_isnan(a)) {
+      if (at::_isnan(b)) {
+        return idx_a < idx_b;
+      }
+      return true;
+    }
+    return (a == b) ? idx_a < idx_b : (a > b);
+  }
+};
+
+template <typename comp_t>
+struct MinMaxReductionOps {
+  using scalar_t = typename binary_function_traits<comp_t>::arg1_t;
+  using index_t = int64_t;
+  using arg_t = detail::pair<scalar_t, index_t>;
+
+  static C10_DEVICE arg_t project(arg_t arg) {
+    return arg;
+  }
+
+  static C10_DEVICE arg_t reduce(arg_t arg, scalar_t val, int64_t idx) {
+    return comp_t{}(arg.first, val, arg.second, idx) ? arg : arg_t(val, idx);
+  }
+
+  static C10_DEVICE arg_t combine(arg_t a, arg_t b) {
+    return comp_t{}(a.first, b.first, a.second, b.second) ? a : b;
+  }
+
+  static C10_DEVICE arg_t translate_idx(arg_t a, int64_t base_idx) {
+    return {a.first, a.second + base_idx};
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  static C10_DEVICE arg_t warp_shfl_down(arg_t arg, int offset) {
+    return arg_t(WARP_SHFL_DOWN(arg.first, offset),
+                 WARP_SHFL_DOWN(arg.second, offset));
+  }
+#endif
+};
+
+template <typename comp_t>
+struct ArgReductionOps : public MinMaxReductionOps<comp_t> {
+  using typename MinMaxReductionOps<comp_t>::scalar_t;
+  using typename MinMaxReductionOps<comp_t>::index_t;
+  using typename MinMaxReductionOps<comp_t>::arg_t;
+
+  static C10_DEVICE index_t project(arg_t arg) {
+    return arg.second;
+  }
+};
+
+} // namespace detail
+
+template <typename scalar_t>
+struct ArgMaxOps :
+  public detail::ArgReductionOps<detail::GreaterOrNan<scalar_t>> {
+};
+
+template <typename scalar_t>
+struct ArgMinOps :
+  public detail::ArgReductionOps<detail::LessOrNan<scalar_t>> {
+};
+
+template <typename scalar_t>
+struct MinOps :
+  public detail::MinMaxReductionOps<detail::LessOrNan<scalar_t>> {
+};
+
+template <typename scalar_t>
+struct MaxOps :
+  public detail::MinMaxReductionOps<detail::GreaterOrNan<scalar_t>> {
+};
+
+template <typename scalar_t, typename acc_scalar_t, typename index_t>
+struct MinMaxOps {
+  using acc_t = detail::pair<acc_scalar_t, acc_scalar_t>;
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, index_t /*idx*/) const {
+    return combine(acc, {data, data});
+  }
+
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    auto min_val = (at::_isnan(a.first) || a.first < b.first) ? a.first : b.first;
+    auto max_val = (at::_isnan(a.second) || a.second > b.second) ? a.second : b.second;
+
+    return {min_val, max_val};
+  }
+
+  inline C10_DEVICE acc_t project(acc_t acc) const {
+    return acc;
+  }
+
+  static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
+    return acc;
+  }
+
+#if defined(__CUDACC__) || defined(__HIPCC__)
+  inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const {
+    return {
+      WARP_SHFL_DOWN(acc.first, offset), WARP_SHFL_DOWN(acc.second, offset)
+    };
+  }
+#endif
+};
+
+}} // namespace at::native
+
+#undef MAX
+#undef MIN
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Sorting.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Sorting.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ab806645fbf144fe9df5f41497f4ca753e8573f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Sorting.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at::native {
+
+enum class QUANTILE_INTERPOLATION_MODE : uint8_t {
+  LINEAR,
+  LOWER,
+  HIGHER,
+  MIDPOINT,
+  NEAREST
+};
+
+using sort_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, bool, bool);
+using topk_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, int64_t, bool, bool);
+
+DECLARE_DISPATCH(sort_fn, sort_stub);
+DECLARE_DISPATCH(topk_fn, topk_stub);
+
+void _fill_indices(const TensorBase &indices, int64_t dim);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SortingUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SortingUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0f9cfa8bf0ce554d198b7e55079ed13e1798175
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SortingUtils.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include <ATen/NumericUtils.h>
+#include <ATen/native/Resize.h>
+#include <c10/util/irange.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty.h>
+#endif
+
+namespace at::native {
+
+// ensure we get good values and indices for kthvalue, mode
+// this will always be with the reducing dim as 1-d
+inline void _reduction_with_indices_allocate_or_resize_output(
+    Tensor& values,
+    Tensor& indices,
+    const Tensor& self,
+    int64_t dim_,
+    bool keepdim) {
+  int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
+  auto result_sizes = self.sizes().vec();
+  if (!result_sizes.empty()) {
+    result_sizes[dim] = 1;
+  }
+  if (values.defined()) {
+    TORCH_CHECK(
+        self.options().type_equal(values.options()),
+        "output values must be of same type as input");
+    if (!keepdim && values.dim() == self.dim() - 1) {
+      // unsqueeze to preserve passed in noncontiguous tensor in resize
+      values.unsqueeze_(dim);
+    }
+    resize_output(values, result_sizes);
+  } else {
+    values = at::empty(result_sizes, self.options());
+  }
+  if (indices.defined()) {
+    TORCH_CHECK(
+        indices.dtype() == kLong, "output indices must be of scalar type Long");
+    TORCH_CHECK(
+        indices.device() == self.device(),
+        "output indices must be on same device as input");
+    if (!keepdim && indices.dim() == self.dim() - 1) {
+      // unsqueeze to preserve passed in noncontiguous tensor in resize
+      indices.unsqueeze_(dim);
+    }
+    resize_output(indices, result_sizes);
+  } else {
+    indices = at::empty(result_sizes, self.options().dtype(kLong));
+  }
+}
+
+// ensure we get good values and indices for topk
+inline void _allocate_or_resize_output_with_indices(
+    Tensor& values,
+    Tensor& indices,
+    const Tensor& self,
+    int64_t dim_,
+    int64_t k) {
+  int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
+  auto result_sizes = self.sizes().vec();
+  if (!result_sizes.empty()) {
+    result_sizes[dim] = k;
+  }
+  if (values.defined()) {
+    TORCH_CHECK(
+        self.options().type_equal(values.options()),
+        "output values must be of same type as input");
+    values.resize_(result_sizes);
+  } else {
+    values = at::empty(result_sizes, self.options());
+  }
+  if (indices.defined()) {
+    TORCH_CHECK(
+        indices.dtype() == kLong, "output indices must be of scalar type Long");
+    TORCH_CHECK(
+        indices.device() == self.device(),
+        "output indices must be on same device as input");
+    indices.resize_(result_sizes);
+  } else {
+    indices = at::empty(result_sizes, self.options().dtype(kLong));
+  }
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..3953ca0d17b57f2547bafd2d08bec577b83b713e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SparseTensorUtils.h
@@ -0,0 +1,190 @@
+#pragma once
+
+#include <ATen/Parallel.h>
+#include <ATen/SparseTensorImpl.h>
+#include <ATen/core/Tensor.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty.h>
+#include <ATen/ops/tensor.h>
+#endif
+
+namespace at::sparse {
+
+// Just for documentary purposes
+using SparseTensor = Tensor;
+using SparseType = Type;
+
+// This is an internal utility function for getting at the SparseTensorImpl,
+// so that we can write sparse tensor specific accessors for special fields
+// in SparseTensor.  You should only use this for writing low level
+// setters/getters for SparseTensorImpl fields; otherwise, you should use
+// the low level setters/getters that were implemented using this.
+//
+// This may be called repeatedly, so make sure it's pretty cheap.
+inline SparseTensorImpl* get_sparse_impl(const SparseTensor& self) {
+  TORCH_INTERNAL_ASSERT(
+      self.is_sparse(), "_internal_get_SparseTensorImpl: not a sparse tensor");
+  return static_cast<SparseTensorImpl*>(self.unsafeGetTensorImpl());
+}
+
+// Takes indices and values and directly puts them into the sparse tensor, no
+// copy.  This used to be called THSTensor_(_move)
+inline void alias_into_sparse(
+    const SparseTensor& self,
+    const Tensor& indices,
+    const Tensor& values) {
+  get_sparse_impl(self)->set_indices_and_values_unsafe(indices, values);
+}
+
+// Take indices and values and makes a (data) copy of them to put into the
+// sparse indices/values.  This used to be called THSTensor_(_set)
+inline void copy_into_sparse(
+    const SparseTensor& self,
+    const Tensor& indices,
+    const Tensor& values,
+    bool non_blocking) {
+  alias_into_sparse(
+      self,
+      indices.to(self._indices().options(), non_blocking, /*copy=*/true),
+      values.to(self._values().options(), non_blocking, /*copy=*/true));
+}
+
+// TODO: put this into the public API
+inline bool is_same_tensor(const Tensor& lhs, const Tensor& rhs) {
+  return lhs.unsafeGetTensorImpl() == rhs.unsafeGetTensorImpl();
+}
+
+inline bool is_same_density(const SparseTensor& self, const SparseTensor& src) {
+  return self.sparse_dim() == src.sparse_dim() &&
+      self.dense_dim() == src.dense_dim();
+}
+
+// Give us a new values tensor, with the same dimensionality
+// as 'values' but with a new number of non-zero elements.
+// TODO: Expose this for real in ATen, some day?
+// NB: Doesn't preserve data.
+inline Tensor new_values_with_size_of(const Tensor& values, int64_t nnz) {
+  std::vector<int64_t> size = values.sizes().vec();
+  size[0] = nnz;
+  return at::empty(size, values.options());
+}
+
+// NOTE [ Flatten Sparse Indices ]
+// This helper function flattens a sparse indices tensor (a Tensor) into a 1D
+// indices tensor. E.g.,
+//   input = [[2, 4, 0],
+//            [3, 1, 10]]
+//   full_size = [2, 12]
+//   output = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 10 ] = [27, 49, 10]
+//
+// In other words, assuming that each `indices[i, :]` is a valid index to a
+// tensor `t` of shape `full_size`. This returns the corresponding indices to
+// the flattened tensor `t.reshape( prod(full_size[:indices.size(0)]), -1 )`.
+// if forceClone is true, the result will forced to be a clone of self.
+// if force_clone is true, the result will forced to be a clone of self.
+TORCH_API Tensor flatten_indices(
+    const Tensor& indices,
+    IntArrayRef full_size,
+    bool force_clone = false);
+
+// Flatten sparse tensor's indices from nD to 1D, similar to NOTE [ Flatten
+// Sparse Indices ], except this one allows partial flatten: only flatten on
+// specified dims. Note that the flatten indices might be uncoalesced if
+// dims_to_flatten.size() < sparse_dim. Also if input indices is already
+// coalesced, the flattened indices will also be sorted.
+//
+// args:
+//    indices: sparse tensor indices
+//    sizes: sparse tensor sizes
+//    dims_to_flatten: a list of dim index to flatten
+//
+// Ex1:
+//   indices = [[2, 4, 0],
+//             [3, 1, 3]]
+//   sizes = [2, 12]
+//   dims_to_flatten = [0, 1]
+//   new_indices = [ 2 * 12 + 3, 4 * 12 + 1, 0 * 12 + 3 ] = [27, 49, 3]
+//
+// Ex2:
+//   dims_to_flatten = [1]
+//   new_indices = [ 3, 1, 3 ]  # uncoalesced
+TORCH_API Tensor flatten_indices_by_dims(
+    const Tensor& indices,
+    const IntArrayRef& sizes,
+    const IntArrayRef& dims_to_flatten);
+
+// Find the CSR representation for a row `indices` from the COO format
+TORCH_API Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz);
+
+TORCH_API Tensor zeros_like_with_indices(const Tensor& t);
+
+template <size_t static_shape_max_len>
+class TensorGeometryHolder {
+  using geometry_holder_t = std::array<int64_t, static_shape_max_len>;
+
+ public:
+  explicit TensorGeometryHolder(
+      IntArrayRef sizes,
+      IntArrayRef strides,
+      TensorOptions options = {}) {
+    std::copy(sizes.begin(), sizes.end(), t_sizes.begin());
+    std::copy(strides.begin(), strides.end(), t_strides.begin());
+  }
+
+  explicit TensorGeometryHolder(const Tensor& t)
+      : TensorGeometryHolder(t.sizes(), t.strides()) {}
+
+  auto operator*() const {
+    return std::make_tuple(t_sizes, t_strides);
+  }
+
+ private:
+  geometry_holder_t t_sizes;
+  geometry_holder_t t_strides;
+};
+
+template <>
+class TensorGeometryHolder<0> {
+  using geometry_holder_t = Tensor;
+
+ public:
+  explicit TensorGeometryHolder(
+      IntArrayRef sizes,
+      IntArrayRef strides,
+      TensorOptions options) {
+    const int64_t t_ndims = sizes.size();
+    const auto cpu_options = TensorOptions(options).dtype(kLong).device(kCPU);
+    Tensor t_sizes_and_strides_cpu = at::empty({2, t_ndims}, cpu_options);
+    t_sizes_and_strides_cpu.select(0, 0).copy_(at::tensor(sizes, cpu_options));
+    t_sizes_and_strides_cpu.select(0, 1).copy_(
+        at::tensor(strides, cpu_options));
+    const Tensor t_sizes_and_strides =
+        t_sizes_and_strides_cpu.to(options.device());
+    t_sizes = t_sizes_and_strides.select(0, 0);
+    t_strides = t_sizes_and_strides.select(0, 1);
+  }
+
+  explicit TensorGeometryHolder(const Tensor& t)
+      : TensorGeometryHolder(t.sizes(), t.strides(), t.options()) {}
+
+  auto operator*() const {
+    return std::make_tuple(
+        t_sizes.template data_ptr<int64_t>(),
+        t_strides.template data_ptr<int64_t>());
+  }
+
+ private:
+  geometry_holder_t t_sizes;
+  geometry_holder_t t_strides;
+};
+
+// Return all indices of a tensor with the given shape.
+//
+// full_coo_indices(shape) is equivalent to
+// torch.ones(shape).nonzero().transpose(-2, -1) but much faster.
+TORCH_API Tensor full_coo_indices(IntArrayRef sizes, TensorOptions options);
+
+} // namespace at::sparse
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b02b4201ffaa175b58edeb8288b667b393dc07d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexing.h
@@ -0,0 +1,49 @@
+#pragma once
+
+// Indexing tensors by tensors
+
+#include <ATen/core/List.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/ReductionType.h>
+
+namespace at {
+struct TensorIterator;
+}
+
+namespace at::native {
+
+using index_put_with_sort_fn = void(*)(Tensor &, const c10::List<std::optional<Tensor>> &, const Tensor &, bool accumulate, bool unsafe);
+using index_put_with_sort_quantized_fn = void(*)(Tensor& self, const c10::List<std::optional<Tensor>>& indices, const Tensor& value, double scale, int zero_point, bool unsafe);
+using gather_fn = void (*)(const Tensor & result, const Tensor & self, int64_t dim, const Tensor & index);
+using scatter_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
+using scatter_fill_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Scalar& src);
+using scatter_add_fn = void(*)(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src);
+using scatter_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
+                                  const Tensor& src, const ReductionType& reduce);
+using scatter_scalar_reduce_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
+                                         const Scalar& value, const ReductionType& reduce);
+using scatter_reduce_two_fn = void(*)(const Tensor& self, const int64_t dim, const Tensor& index,
+                                      const Tensor& src, const ReductionType& reduce);
+
+DECLARE_DISPATCH(index_put_with_sort_fn, index_put_with_sort_stub);
+DECLARE_DISPATCH(index_put_with_sort_quantized_fn, index_put_with_sort_quantized_stub);
+DECLARE_DISPATCH(gather_fn, gather_stub);
+DECLARE_DISPATCH(scatter_fn, scatter_stub);
+DECLARE_DISPATCH(scatter_fill_fn, scatter_fill_stub);
+DECLARE_DISPATCH(scatter_add_fn, scatter_add_stub);
+DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub);
+DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub);
+DECLARE_DISPATCH(scatter_reduce_two_fn, scatter_reduce_two_stub);
+
+TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List<std::optional<at::Tensor>>& indices);
+
+using scatter_add_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&);
+using scatter_reduce_expanded_index_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const ReductionType& reduce, bool);
+using gather_expanded_index_fn = void (*)(const Tensor&, const Tensor&, const Tensor&);
+
+DECLARE_DISPATCH(scatter_add_expanded_index_fn, scatter_add_expanded_index_stub);
+DECLARE_DISPATCH(scatter_reduce_expanded_index_fn, scatter_reduce_expanded_index_stub);
+DECLARE_DISPATCH(gather_expanded_index_fn, gather_expanded_index_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9b616013ddb26e20bd26d33651e76863886e129
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorAdvancedIndexingUtils.h
@@ -0,0 +1,94 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/native/IndexingUtils.h>
+#include <ATen/native/TensorIterator.h>
+
+namespace at::native {
+namespace {
+#ifndef STRIP_ERROR_MESSAGES
+inline std::string shapes_as_str(TensorList tensors) {
+  std::ostringstream os;
+  bool first = true;
+  for (auto& tensor : tensors) {
+    if (tensor.defined()) {
+      if (!first) {
+        os << ", ";
+      }
+      os << tensor.sizes();
+      first = false;
+    }
+  }
+  return os.str();
+}
+#endif
+} // anonymous namespace
+
+inline std::tuple<bool, Tensor> canDispatchToMaskedFill(const Tensor& self, const torch::List<std::optional<at::Tensor>>& indices,
+const Tensor& value){
+  if (!(value.numel() ==1 && value.device().is_cpu())){
+    return std::make_tuple(false,Tensor());
+  }
+  int64_t num_ind = 0;
+  Tensor mask;
+  auto self_device = self.device();
+  for (const std::optional<Tensor>& i: indices) {
+    if (!i.has_value() || !(*i).defined()){
+      num_ind++;
+    } else {
+      const Tensor &index = *i;
+      if ((index.scalar_type() != kByte && index.scalar_type() != kBool) ||
+          index.device() != self_device || mask.defined()){
+        return std::make_tuple(false, Tensor());
+      } else {
+        mask = index;
+        for (const auto j : c10::irange(index.dim())) {
+          int64_t srcIdx = num_ind + j;
+          TORCH_CHECK_INDEX(index.size(j) == self.size(srcIdx), "The shape of the mask ", index.sizes(), " at index ", j,
+  " does not match the shape of the indexed tensor ", self.sizes(), " at index ", srcIdx);
+        }
+        num_ind += mask.ndimension();
+      }
+    }
+  }
+  for (C10_UNUSED const auto i : c10::irange(num_ind, self.ndimension())) {
+    mask = mask.unsqueeze(-1);
+  }
+  return std::make_tuple(true, mask);
+}
+
+inline AdvancedIndex make_info(Tensor self, IOptTensorListRef orig) {
+  checkIndexTensorTypes(orig, /*allow_int*/ true);
+  // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors
+  auto indices = expandTensors(self, orig);
+  // next broadcast all index tensors together
+  try {
+    indices = expand_outplace(indices);
+  } catch (std::exception& e) {
+    TORCH_CHECK_INDEX(false, "shape mismatch: indexing tensors could not be broadcast together"
+                   " with shapes ", shapes_as_str(indices));
+  }
+  // add missing null Tensors so that it matches self.dim()
+  while (indices.size() < (size_t)self.dim()) {
+    indices.emplace_back();
+  }
+  // if the non-null indices are not all adjacent, transpose self and indices
+  // together so that they're adjacent at the front
+  if (!hasContiguousSubspace(indices)) {
+    std::tie(self, indices) = transposeToFront(self, indices);
+  }
+  // Ensure indices are on the same device as self
+  for (auto & indice : indices) {
+    if (indice.defined() && indice.device() != self.device()) {
+      indice = indice.to(self.device());
+    }
+  }
+  for (auto & indice : indices) {
+    if (indice.defined() && indice.dtype() == at::kInt) {
+      indice = indice.to(at::kLong);
+    }
+  }
+
+  return AdvancedIndex(self, indices);
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d52446446316af9546bf3a1f5c0eca58a6e608f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorDimApply.h
@@ -0,0 +1,55 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <c10/util/irange.h>
+
+namespace at::native {
+//input tensors are non-zero dim and non-empty
+template<typename T1, typename T2, typename Function>
+
+void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim, Function func) {
+  int ndims = self.dim();
+  int tensor_dim_apply_has_finished = 0;
+  std::vector<int64_t> counter(ndims, 0);
+  const T1* self_data = self.const_data_ptr<T1>();
+  T1* values_data = values.data_ptr<T1>();
+  T2* indices_data = indices.data_ptr<T2>();
+  int64_t self_stride = self.stride(dim);
+  int64_t values_stride = values.stride(dim);
+  int64_t indices_stride = indices.stride(dim);
+  int self_dim_size = self.size(dim);
+
+  while (!tensor_dim_apply_has_finished) {
+    func(self_data, values_data, indices_data, self_dim_size, self_stride, values_stride, indices_stride);
+    if (ndims == 1) {
+       break;
+    }
+    for (const auto dim_i : c10::irange(ndims)) {
+      if (dim_i == dim) {
+        if (dim_i == (ndims - 1)) {
+          tensor_dim_apply_has_finished = 1;
+          break;
+        }
+        continue;
+      }
+      counter[dim_i]++;
+      self_data += self.stride(dim_i);
+      values_data += values.stride(dim_i);
+      indices_data += indices.stride(dim_i);
+
+      if (counter[dim_i] == self.size(dim_i)) {
+        if (dim_i == ndims-1) {
+          tensor_dim_apply_has_finished = 1;
+          break;
+        } else {
+          self_data -= counter[dim_i]*self.stride(dim_i);
+          values_data -= counter[dim_i]*values.stride(dim_i);
+          indices_data -= counter[dim_i]*indices.stride(dim_i);
+          counter[dim_i] = 0;
+        }
+      } else {
+        break;
+     }
+    }
+  }
+}
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h
new file mode 100644
index 0000000000000000000000000000000000000000..32d0a1dc535615e9baf96f02a4afa7c79b6d397a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorFactories.h
@@ -0,0 +1,142 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/EmptyTensor.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/Dispatch.h>
+#include <ATen/Dispatch_v2.h>
+#include <ATen/native/DispatchStub.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/scalar_tensor.h>
+#endif
+
+namespace at::native {
+// Different combinations of row, col, and offset can lead to two cases:
+//
+// Case 1 - Trapezoid (Triangle as a special case): row + offset <= col
+//    Example A: offset > 0
+//      1 1 0 0 0
+//      1 1 1 0 0
+//      1 1 1 1 0
+//    Example B: offset <= 0
+//      0 0 0
+//      1 0 0
+//      1 1 0
+//    In this case, we calculate the number of elements in the first row and
+//    last row of the tril respectively, and then compute the tril size.
+//
+// Case 2 - Trapezoid + Rectangle: row + offset > col
+//    Example:
+//      1 1 0
+//      1 1 1
+//      1 1 1
+//    In this case, we first calculate the size of top trapezoid, and then
+//    calculate the size of the bottom rectangle.
+inline int64_t get_tril_size(int64_t row, int64_t col, int64_t offset) {
+  // If either dimension is 0 then the there is no tril
+  if (row == 0 || col == 0) {
+    return 0;
+  }
+  // number of elements in the first row of the tril
+  auto m_first_row = offset > 0 ?
+    std::min<int64_t>(col, 1 + offset) : // upper bounded by col
+    row + offset > 0; // either 0 or 1
+  // number of elements in the last row of the tril, bounded by [0, col]
+  auto m_last_row = std::max<int64_t>(0, std::min<int64_t>(col, row + offset));
+  // number of rows, bounded by [0, row]
+  auto n_row_all = std::max<int64_t>(0, std::min<int64_t>(row, row + offset));
+  auto n_row_trapezoid = (m_last_row - m_first_row + 1);
+
+  // calculate # of elements in the top trapezoid
+  auto tril_size = (m_first_row + m_last_row) * n_row_trapezoid >> 1;
+
+  // calculate # of elements in the bottom rectangle if there is any
+  auto diff_row = n_row_all - n_row_trapezoid;
+  if (diff_row > 0) {
+    tril_size += diff_row * col;
+  }
+
+  return tril_size;
+}
+
+inline void check_args(
+    int64_t row, int64_t col, std::optional<Layout> layout_opt) {
+  TORCH_CHECK(row >= 0, "row must be non-negative, got", row);
+  TORCH_CHECK(col >= 0, "col must be non-negative, got", col);
+  if (layout_opt.has_value()) {
+    TORCH_CHECK(
+      *layout_opt == at::kStrided,
+      "only support layout=torch.strided, got",
+      *layout_opt)
+  }
+}
+
+using at::check_size_nonnegative;
+
+// assumes maximum value in created tensor is n-1 (e.g., torch.randperm(n))
+inline void check_supported_max_int_with_precision(int64_t n, const Tensor& tensor) {
+  // match defined() to behavior of checks below
+  TORCH_CHECK(at::scalar_tensor(n>0?n-1:n, tensor.options()).defined(),
+              "n is too large for result tensor type: '", tensor.toString(), "'");
+
+  // Ensure sufficient precision for floating point representation.
+  switch (tensor.scalar_type()) {
+    case at::ScalarType::Half:
+      TORCH_CHECK(n <= (int64_t(1) << 11) + 1, "n cannot be greater than 2049 for Half type.");
+      break;
+    case at::ScalarType::Float:
+      TORCH_CHECK(n <= (int64_t(1) << 24) + 1, "n cannot be greater than 2^24+1 for Float type.");
+      break;
+    case at::ScalarType::Double:  // Unlikely to happen, but doesn't hurt to check
+      TORCH_CHECK(n <= (int64_t(1) << 53) + 1, "n cannot be greater than 2^53+1 for Double type.");
+      break;
+    default:
+      break;
+  }
+}
+
+// Called by `empty*` functions when deterministic algorithms are enabled to
+// fill the tensor with NaN if it is floating point or complex type, or fill
+// with max value if it is integer type
+inline Tensor& fill_empty_deterministic_(Tensor& tensor) {
+  if (tensor.is_floating_point() || tensor.is_complex()) {
+    AT_DISPATCH_V2(
+      tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {
+        tensor.fill_(std::numeric_limits<scalar_t>::quiet_NaN());
+    }), AT_EXPAND(AT_FLOATING_TYPES), AT_EXPAND(AT_COMPLEX_TYPES), AT_EXPAND(AT_FLOAT8_TYPES), kBFloat16, kHalf);
+  } else {
+    AT_DISPATCH_V2(
+      tensor.scalar_type(), "fill_empty_deterministic_", AT_WRAP([&]() {
+        tensor.fill_(std::numeric_limits<scalar_t>::max());
+    }), kBool, AT_EXPAND(AT_INTEGRAL_TYPES_V2));
+  }
+  return tensor;
+}
+
+// The ZeroTensor allocator ignores whatever allocation is requested and always
+// gives you nullptr
+struct ZeroTensorAllocator final : public at::Allocator {
+  ZeroTensorAllocator(at::Device device) : device_(device) {};
+  ~ZeroTensorAllocator() override = default;
+  static void deleter(void* const pointer) {
+    TORCH_INTERNAL_ASSERT(!pointer);
+  }
+  DataPtr allocate(const size_t /*nbytes*/) override {
+    return {nullptr, nullptr, &deleter, device_};
+  }
+  DeleterFnPtr raw_deleter() const override {
+    return deleter;
+  }
+  void copy_data(void* dest [[maybe_unused]], const void* src [[maybe_unused]], std::size_t count [[maybe_unused]]) const final {}
+  at::Device device_;
+};
+
+using binary_fn = void (*)(TensorIterator&);
+
+DECLARE_DISPATCH(binary_fn, complex_stub);
+DECLARE_DISPATCH(binary_fn, polar_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2bdd6eb13e4bde62cba1f9b5c65726abb5bb7d0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIteratorDynamicCasting.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <complex>
+#include <type_traits>
+#include <c10/core/ScalarType.h>
+#include <ATen/detail/FunctionTraits.h>
+#include <ATen/native/TensorIterator.h>
+
+
+// This file includes utilities for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h.
+
+// dynamic_casting handles when the types expected by the iterator do not match the types of the arguments
+// to the function that is being called.
+// On CUDA, the cast is currently pushed down into the kernel (for performance reasons).
+// On CPU, there is currently an internal assert that a dynamic_cast is not needed.
+
+namespace at::native {
+
+// `needs_dynamic_casting` compares the types expected by iterator
+// (i.e. dtypes of the operands) with the actual type of the arguments
+// (and returns) of func_t
+template<typename func_t, int nargs=function_traits<func_t>::arity>
+struct needs_dynamic_casting {
+  static bool check(TensorIteratorBase& iter) {
+    using traits = function_traits<func_t>;
+    using cpp_type = typename traits::template arg<nargs - 1>::type;
+    using cpp_map = c10::CppTypeToScalarType<cpp_type>;
+
+    if (iter.input_dtype(nargs-1) != cpp_map::value) {
+      return true;
+    }
+    return needs_dynamic_casting<func_t, nargs - 1>::check(iter);
+  }
+};
+
+template<typename func_t>
+struct needs_dynamic_casting<func_t, 0> {
+  static bool check(TensorIteratorBase& iter) {
+    using traits = function_traits<func_t>;
+    using cpp_type = typename traits::result_type;
+
+    // we could assert output numbers are correct here, but checks
+    // (including arity) are currently pushed outside of this struct.
+    if constexpr (std::is_void_v<cpp_type>) {
+      return false;
+    } else {
+      return iter.dtype(0) != c10::CppTypeToScalarType<cpp_type>::value;
+    }
+  }
+};
+
+} //namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h
new file mode 100644
index 0000000000000000000000000000000000000000..c35023d076e73d1c812b87b1b308286ebc6a6b07
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorShape.h
@@ -0,0 +1,105 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <c10/util/irange.h>
+#include <ATen/core/IListRef.h>
+
+namespace at::native {
+
+TORCH_API at::Tensor clone_preserve_strides(const at::Tensor& self);
+
+inline bool cat_should_skip_tensor(const Tensor& t) {
+  return t.sym_numel() == 0 && t.dim() == 1;
+}
+
+ // Check to see if the shape of tensors is compatible
+ // for being concatenated along a given dimension.
+inline void check_cat_shape_except_dim(const Tensor & first, const Tensor & second, int64_t dimension, int64_t index) {
+   int64_t first_dims = first.dim();
+   int64_t second_dims = second.dim();
+   TORCH_CHECK(first_dims == second_dims, "Tensors must have same number of dimensions: got ",
+               first_dims, " and ", second_dims);
+   for (const auto dim : c10::irange(first_dims)) {
+     if (dim == dimension) {
+       continue;
+     }
+     int64_t first_dim_size = first.sizes()[dim];
+     int64_t second_dim_size = second.sizes()[dim];
+     TORCH_CHECK(first_dim_size == second_dim_size, "Sizes of tensors must match except in dimension ",
+                 dimension, ". Expected size ", static_cast<long long>(first_dim_size), " but got size ", static_cast<long long>(second_dim_size), " for tensor number ", index, " in the list.");
+   }
+ }
+
+inline void check_cat_no_zero_dim(const MaterializedITensorListRef& tensors) {
+  int64_t i = 0;
+  for(const Tensor& t : tensors) {
+    TORCH_CHECK(t.dim() > 0,
+             "zero-dimensional tensor (at position ", i, ") cannot be concatenated");
+    i++;
+  }
+}
+
+inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t dim) {
+  TORCH_CHECK(self.dim() != 0, "split expects at least a 1-dimensional tensor");
+  TORCH_CHECK(split_size >= 0,  "split expects split_size be non-negative, but got split_size=", split_size);
+  int64_t dim_size = self.size(dim);
+  TORCH_CHECK(split_size > 0 || dim_size == 0,
+           "split_size can only be 0 if dimension size is 0, "
+           "but got dimension size of ", dim_size);
+  // if split_size is 0 and dimension size is 0, there is 1 split.
+  int64_t num_splits = 1;
+  if (split_size != 0) {
+    // ensuring num_splits is at least 1 makes consistent the case where split_size > dim_size
+    // (returns a single split).  We might want to error here, but keep it for BC.
+    num_splits = std::max<int64_t>((dim_size + split_size - 1) / split_size, 1);
+  }
+  return num_splits;
+}
+
+inline bool have_same_ndims(TensorList tensors) {
+  auto ndim = tensors[0].dim();
+  for (const auto tensor_idx : c10::irange(tensors.size())) {
+    if(tensors[tensor_idx].dim() != ndim) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline void leading_dimension_matches(TensorList tensors, int64_t dim) {
+  auto tensor_zero_size = tensors[0].sizes();
+  std::vector<c10::SymInt> leading_dim_sizes(tensor_zero_size.begin(), tensor_zero_size.begin() + dim);
+  for (const auto i : c10::irange(tensors.size())) {
+    at::Tensor tensor = tensors[i];
+    for(const auto j : c10::irange(dim)) {
+      TORCH_CHECK(
+        tensor.size(j) == leading_dim_sizes[j],
+        "_chunk_cat expects same sizes of 0,...,dim-1 dimensions for all tensors"
+      );
+    }
+  }
+}
+
+inline int64_t preprocess_chunk_cat_inputs(TensorList tensors, int64_t dim, int64_t num_chunks) {
+  TORCH_CHECK(num_chunks >= 1, "_chunk_cat expects positive num_chunks");
+  TORCH_CHECK(!tensors.empty(),
+           "_chunk_cat expects a non-empty input tensor list");
+  auto expected_dtype = tensors[0].dtype();
+  auto expected_device = tensors[0].device();
+  for(const auto i : c10::irange(tensors.size())) {
+    TORCH_CHECK(tensors[i].numel() > 0, "_chunk_cat expects non-empty tensor");
+    TORCH_CHECK(tensors[i].dtype() == expected_dtype, "_chunk_cat expects all input tensors with the same dtype");
+    TORCH_CHECK(tensors[i].device() == expected_device, "_chunk_cat expects all inputs tensors on the same device");
+  }
+  if (have_same_ndims(tensors)) {
+    dim = maybe_wrap_dim(dim, tensors[0].dim());
+  } else {
+    TORCH_CHECK(dim >= 0, "_chunk_cat expects non-negative dim when input tensors have different ndims")
+    for(const auto i : c10::irange(tensors.size())) {
+      TORCH_CHECK(dim < tensors[i].ndimension(), "_chunk_cat expects dim < ndim for all input tensors");
+    }
+  }
+  leading_dimension_matches(tensors, dim);
+  return dim;
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc56fa6457e75bc980747afc9d2d72257d6c093b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h
@@ -0,0 +1,57 @@
+#include <ATen/core/Tensor.h>
+#include <ATen/native/LinearAlgebraUtils.h>
+
+namespace at::native {
+
+/*
+ * Given batches of matrices with arbitrary batch dim,
+ * computes the number of batches for Triu and Tril. This ignores stride 0 dimension
+ */
+static inline int64_t batchCountTrilTriu(const Tensor& batched_matrices) {
+  int64_t result = 1;
+  for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) {
+    if (batched_matrices.stride(i) != 0) {
+      result *= batched_matrices.size(i);
+    }
+  }
+  return result;
+}
+
+/* Checks a necessary property for the triu and tril implementations, hence the name.
+ * Here batch contiguity is checked for tensors with greater than 4 dimensions.
+ * Contiguous tensors and tensors with less than 3 dimensions pass this check
+ */
+static inline std::tuple<bool, Tensor> checkTrilTriuBatchContiguous(const Tensor& tensor, bool allow_zero_stride) {
+  // Complete contiguity is the most desired property, which is why
+  // we return true if the tensor is contiguous
+  if (tensor.is_contiguous()) {
+    auto default_strides_for_size = batched_matrix_contiguous_strides(tensor.sizes());
+    if (tensor.strides() == default_strides_for_size) {
+      return std::make_tuple(true, tensor);
+    } else {
+      return std::make_tuple(false, tensor.as_strided(tensor.sizes(), default_strides_for_size));
+    }
+  }
+
+  int64_t dims = tensor.dim();
+
+  // Tensors with dimension less than 4 are handled by default
+  if (allow_zero_stride && dims <= 3) {
+    return std::make_tuple(true, tensor);
+  }
+
+  int64_t expected_stride = tensor.size(-1) * tensor.size(-2);
+  for (int64_t i = dims - 3; i >= 0; i--) {
+    // Skip trivial dimension;
+    if (allow_zero_stride && i == 0 && (tensor.stride(i) == 0 || tensor.size(i) == 1)) {
+      continue;
+    }
+    if (expected_stride != tensor.stride(i)) {
+      return std::make_tuple(false, tensor.contiguous());
+    }
+    expected_stride *= tensor.size(i);
+  }
+  return std::make_tuple(true, tensor);
+}
+
+}  // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnaryOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnaryOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d99fdc40d048b208be65b4b44d6f0e59c9e1a62
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnaryOps.h
@@ -0,0 +1,130 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+#include <ATen/Generator.h>
+#include <c10/core/Scalar.h>
+#include <stdexcept>
+
+namespace at {
+class Tensor;
+class TensorBase;
+struct TensorIteratorBase;
+}
+
+namespace at::native {
+
+using unary_fn = void(*)(TensorIteratorBase&);
+using unary_fn_with_scalar = void(*)(TensorIteratorBase&, const Scalar& a);
+
+inline namespace CPU_CAPABILITY {
+void conj_kernel(TensorIteratorBase &iter);
+void neg_kernel(TensorIteratorBase &iter);
+void reciprocal_kernel(TensorIteratorBase &iter);
+void rsqrt_kernel(TensorIteratorBase& iter);
+void sqrt_kernel(TensorIteratorBase& iter);
+} // namespace CPU_CAPABILITY
+
+DECLARE_DISPATCH(unary_fn, abs_stub);
+DECLARE_DISPATCH(unary_fn, angle_stub);
+DECLARE_DISPATCH(unary_fn, conj_physical_stub);
+DECLARE_DISPATCH(unary_fn, acos_stub);
+DECLARE_DISPATCH(unary_fn, acosh_stub);
+DECLARE_DISPATCH(unary_fn, asinh_stub);
+DECLARE_DISPATCH(unary_fn, atanh_stub);
+DECLARE_DISPATCH(unary_fn, asin_stub);
+DECLARE_DISPATCH(unary_fn, atan_stub);
+DECLARE_DISPATCH(unary_fn, bitwise_not_stub);
+DECLARE_DISPATCH(unary_fn, logical_not_stub);
+DECLARE_DISPATCH(unary_fn, ceil_stub);
+DECLARE_DISPATCH(unary_fn, cos_stub);
+DECLARE_DISPATCH(unary_fn, cosh_stub);
+DECLARE_DISPATCH(unary_fn, digamma_stub);
+DECLARE_DISPATCH(unary_fn, special_entr_stub);
+DECLARE_DISPATCH(unary_fn, special_erfcx_stub);
+DECLARE_DISPATCH(unary_fn, erf_stub);
+DECLARE_DISPATCH(unary_fn, erfc_stub);
+DECLARE_DISPATCH(unary_fn, erfinv_stub);
+DECLARE_DISPATCH(unary_fn, exp_stub);
+DECLARE_DISPATCH(unary_fn, exp2_stub);
+DECLARE_DISPATCH(unary_fn, expm1_stub);
+DECLARE_DISPATCH(unary_fn, floor_stub);
+DECLARE_DISPATCH(unary_fn, frac_stub);
+DECLARE_DISPATCH(unary_fn, frexp_stub);
+DECLARE_DISPATCH(unary_fn, i0_stub);
+DECLARE_DISPATCH(unary_fn, special_i0e_stub);
+DECLARE_DISPATCH(unary_fn, special_i1_stub);
+DECLARE_DISPATCH(unary_fn, special_i1e_stub);
+DECLARE_DISPATCH(unary_fn, log_stub);
+DECLARE_DISPATCH(unary_fn, log10_stub);
+DECLARE_DISPATCH(unary_fn, log1p_stub);
+DECLARE_DISPATCH(unary_fn, log2_stub);
+DECLARE_DISPATCH(unary_fn, special_ndtri_stub);
+DECLARE_DISPATCH(unary_fn, special_log_ndtr_stub);
+DECLARE_DISPATCH(unary_fn, neg_stub);
+
+DECLARE_DISPATCH(unary_fn, reciprocal_stub);
+DECLARE_DISPATCH(unary_fn, round_stub);
+DECLARE_DISPATCH(unary_fn, rsqrt_stub);
+DECLARE_DISPATCH(unary_fn, sigmoid_stub);
+DECLARE_DISPATCH(unary_fn_with_scalar, logit_stub);
+DECLARE_DISPATCH(unary_fn, sign_stub);
+DECLARE_DISPATCH(unary_fn, signbit_stub);
+DECLARE_DISPATCH(unary_fn, sgn_stub);
+DECLARE_DISPATCH(unary_fn, sin_stub);
+DECLARE_DISPATCH(unary_fn, sinc_stub);
+DECLARE_DISPATCH(unary_fn, sinh_stub);
+DECLARE_DISPATCH(unary_fn, sqrt_stub);
+DECLARE_DISPATCH(unary_fn, tan_stub);
+DECLARE_DISPATCH(unary_fn, tanh_stub);
+DECLARE_DISPATCH(unary_fn, trigamma_stub);
+DECLARE_DISPATCH(unary_fn, trunc_stub);
+DECLARE_DISPATCH(unary_fn, lgamma_stub);
+DECLARE_DISPATCH(unary_fn, special_airy_ai_stub);
+DECLARE_DISPATCH(unary_fn, special_bessel_j0_stub);
+DECLARE_DISPATCH(unary_fn, special_bessel_j1_stub);
+DECLARE_DISPATCH(unary_fn, special_bessel_y0_stub);
+DECLARE_DISPATCH(unary_fn, special_bessel_y1_stub);
+DECLARE_DISPATCH(unary_fn, special_modified_bessel_i0_stub);
+DECLARE_DISPATCH(unary_fn, special_modified_bessel_i1_stub);
+DECLARE_DISPATCH(unary_fn, special_modified_bessel_k0_stub);
+DECLARE_DISPATCH(unary_fn, special_modified_bessel_k1_stub);
+DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k0_stub);
+DECLARE_DISPATCH(unary_fn, special_scaled_modified_bessel_k1_stub);
+DECLARE_DISPATCH(unary_fn, special_spherical_bessel_j0_stub);
+
+// NB: these are actually defined in Distribution
+DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, std::optional<Generator>), bernoulli_tensor_stub);
+DECLARE_DISPATCH(void(*)(const TensorBase&, const double, std::optional<Generator>), bernoulli_scalar_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional<Generator>), cauchy_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, std::optional<Generator>), exponential_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, std::optional<Generator>), geometric_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional<Generator>), log_normal_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, std::optional<Generator>), uniform_stub);
+DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, std::optional<Generator>), normal_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, std::optional<Generator>), random_from_to_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, std::optional<Generator>), random_full_64_bits_range_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, std::optional<Generator>), random_stub);
+
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t, const double), kaiser_window_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const int64_t), polygamma_stub);
+DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const Scalar& a, const Scalar& b), clamp_stub);
+DECLARE_DISPATCH(
+    void (*)(Tensor&, const Tensor&, int64_t, std::optional<Generator>),
+    multinomial_with_replacement_stub);
+DECLARE_DISPATCH(
+    void (*)(
+        TensorIteratorBase&,
+        std::optional<double>,
+        std::optional<double>,
+        std::optional<double>),
+    nan_to_num_stub);
+DECLARE_DISPATCH(void (*)(TensorIteratorBase&, int64_t), round_decimals_stub);
+
+// Missing unary functions
+// digamma
+// lgamma
+// erfinv
+// clone
+// contiguous
+// zero
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold2d.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5fe7d4468217b16818f4bcfc06fb0481175be38
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold2d.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include <ATen/native/DispatchStub.h>
+#include <c10/core/ScalarType.h>
+#include <cstdint>
+
+namespace at::native {
+
+using unfold2d_copy_fn = void (*)(
+    ScalarType dtype,
+    void *finput,
+    const void *input,
+    int64_t kH,
+    int64_t kW,
+    int64_t dH,
+    int64_t dW,
+    int64_t padH,
+    int64_t padW,
+    int64_t n_input_plane,
+    int64_t input_height,
+    int64_t input_width,
+    int64_t output_height,
+    int64_t output_width,
+    bool is_channels_last
+);
+
+using unfold2d_acc_fn = void (*)(
+    ScalarType dtype,
+    void *finput,
+    void *input,
+    int64_t kH,
+    int64_t kW,
+    int64_t dH,
+    int64_t dW,
+    int64_t padH,
+    int64_t padW,
+    int64_t n_input_plane,
+    int64_t input_height,
+    int64_t input_width,
+    int64_t output_height,
+    int64_t output_width,
+    bool is_channels_last
+);
+
+DECLARE_DISPATCH(unfold2d_copy_fn, unfolded2d_copy_stub);
+DECLARE_DISPATCH(unfold2d_acc_fn, unfolded2d_acc_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..90ead9d1f7ad48187b8c0a28af2fd59915887d17
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Unfold3d.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include <c10/core/ScalarType.h>
+
+namespace at::native {
+
+void Unfold3dCopyCPU(
+    ScalarType dtype,
+    const void *src,
+    int64_t C,
+    int64_t X_D,
+    int64_t X_H,
+    int64_t X_W,
+    int64_t Y_D,
+    int64_t Y_H,
+    int64_t Y_W,
+    int64_t kernel_d,
+    int64_t kernel_h,
+    int64_t kernel_w,
+    int64_t stride_d,
+    int64_t stride_h,
+    int64_t stride_w,
+    int64_t pad_d,
+    int64_t pad_h,
+    int64_t pad_w,
+    void* dst);
+
+void Unfold3dAccCPU(
+    ScalarType dtype,
+    const void *src,
+    int64_t C,
+    int64_t X_D,
+    int64_t X_H,
+    int64_t X_W,
+    int64_t Y_D,
+    int64_t Y_H,
+    int64_t Y_W,
+    int64_t kernel_d,
+    int64_t kernel_h,
+    int64_t kernel_w,
+    int64_t stride_d,
+    int64_t stride_h,
+    int64_t stride_w,
+    int64_t pad_d,
+    int64_t pad_h,
+    int64_t pad_w,
+    void *dst);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h
new file mode 100644
index 0000000000000000000000000000000000000000..44e05c125913ec234247938a25b47e5feeb9f183
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UnfoldBackward.h
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/NonEmptyUtils.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/arange.h>
+#endif
+
+namespace at::native {
+
+using unfold_backward_fn = void (*)(
+  Tensor& grad_in,
+  const Tensor& grad,
+  int64_t dim,
+  int64_t size,
+  int64_t step
+);
+
+DECLARE_DISPATCH(unfold_backward_fn, unfold_backward_stub);
+
+namespace {
+
+// Note on naming: it is unconventional.
+// grad_in does not mean that it is a gradient wrt to input,
+// grad_in/grad_out is just an input/output of unfold_backward kernel.
+
+static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_out(
+  Tensor& grad_out,
+  const Tensor& grad_in,
+  int64_t dim,
+  int64_t size,
+  int64_t step
+) {
+  dim = maybe_wrap_dim(dim, grad_out.dim());
+  // last dim stores the folds
+
+  auto grad_out_dim_size = ensure_nonempty_size(grad_out, dim);
+  auto grad_in_dim_size = ensure_nonempty_size(grad_in, dim);
+  // dictates the number of elements to iterate over
+  // in dimension `dim`
+  auto iter_dim_size = std::min(
+    grad_out_dim_size,
+    (grad_in_dim_size - 1) * step + size
+  );
+
+  /* prepare grad_out for TensorIterator { */
+  auto grad_out_strides = ensure_nonempty_vec(grad_out.strides().vec());
+  auto grad_out_sizes = ensure_nonempty_vec(grad_out.sizes().vec());
+  grad_out_sizes[dim] = iter_dim_size;
+  auto grad_out_restrided = grad_out.as_strided(
+    grad_out_sizes, grad_out_strides
+  );
+  /* } */
+
+  /* prepare grad_in for TensorIterator { */
+  auto grad_in_strides = ensure_nonempty_vec(grad_in.strides().vec());
+  auto grad_in_sizes = ensure_nonempty_vec(grad_in.sizes().vec());
+
+  // set strides for dim to 0
+  // and size to 1 because
+  // this dimension is indexed inside the kernel
+  grad_in_strides[dim] = 0;
+  grad_in_sizes[dim] = 1;
+
+  grad_in_strides.pop_back();
+  grad_in_sizes.pop_back();
+
+  auto grad_in_restrided = grad_in.squeeze(-1).as_strided(
+    grad_in_sizes, grad_in_strides
+  );
+  /* } */
+
+  // During the TensorIterator iteration we have to know
+  // i_dim in grad_out[i_1,...,i_dim,...i_n],
+  // idx_dim stores this information
+  /* prepare idx_dim for TensorIterator { */
+  auto idx_dim = at::arange(
+    0, iter_dim_size, grad_in.options().dtype(at::kLong)
+  );
+
+  auto grad_out_dim = ensure_nonempty_dim(grad_out.dim());
+
+  auto idx_dim_strides = std::vector<int64_t>(grad_out_dim, 0);
+  auto idx_dim_sizes = std::vector<int64_t>(grad_out_dim, 1);
+
+  idx_dim_strides[dim] = 1;
+  idx_dim_sizes[dim] = iter_dim_size;
+
+  // idx_dim size will broadcast over determined by grad_out sizes in TensorIterator
+  auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides);
+  /* } */
+
+  auto iter = TensorIteratorConfig()
+    .set_check_mem_overlap(false)
+    .check_all_same_dtype(false)
+    .resize_outputs(false)
+    .add_owned_output(grad_out_restrided)
+    .add_owned_const_input(grad_in_restrided)
+    .add_owned_const_input(idx_dim_restrided)
+    .build();
+
+  return iter;
+}
+
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h
new file mode 100644
index 0000000000000000000000000000000000000000..033ef2b7fad3ec39a2e9f9ec6ae176b94903bb34
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/UpSample.h
@@ -0,0 +1,505 @@
+#pragma once
+
+#include <math.h>
+
+#include <ATen/OpMathType.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/OpMathType.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/cpu/vec/vec.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/cpu/utils.h>
+
+/**
+ * Note [compute_scales_value]
+ * Note [area_pixel_compute_scale]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * Interpolate with scale_factor can have different behaviors
+ * depending on the value of recompute_scale_factor:
+ *
+ * - With recompute_scale_factor = True (current default behavior):
+ * the scale_factor, when provided by the user, are used to calculate
+ * the output size. The input size and the computed output_size
+ * are then used to infer new values for the scales which are
+ * used in the interpolation.  Because floating-point math is not exact,
+ * this may be a different value from the user-supplied scales.
+ *
+ * - With recompute_scale_factor = False (which will be the default
+ * behavior starting 1.5.0):
+ * the behavior follows opencv logic, and the scales provided by
+ * the user are the ones used in the interpolation calculations.
+ *
+ * If the scales are not provided or if they are provided but
+ * recompute_scale_factor is set to True (default behavior), the scales
+ * are computed from the input and the output size;
+ *
+ *
+ * When the scales are inferred from the input and output sizes,
+ * we view each pixel as an area, idx + 0.5 as its center index.
+ * Here is an example formula in 1D case.
+ * if align_corners: center of two corner pixel areas are preserved,
+ *     (0.5, 0.5) -> (0.5, 0.5),
+ *     (input_size - 0.5, 0.5) -> (output_size - 0.5)
+ *     scale = (input_size - 0.5 - 0.5) / (output_size - 0.5 - 0.5)
+ *     src_index + 0.5 - 0.5 = scale * (dst_index + 0.5 - 0.5)
+ * if not align_corners: the whole range is scaled accordingly
+ *     scale = input_size / output_size
+ *     src_idx + 0.5 = scale * (dst_index + 0.5)
+ */
+
+namespace at::native {
+
+namespace upsample {
+
+TORCH_API c10::SmallVector<int64_t, 3> compute_output_size(
+    c10::IntArrayRef input_size,  // Full input tensor size.
+    at::OptionalIntArrayRef output_size,
+    std::optional<c10::ArrayRef<double>> scale_factors);
+
+inline std::optional<double> get_scale_value(std::optional<c10::ArrayRef<double>> scales, int idx) {
+  if (!scales) {
+    return std::nullopt;
+  }
+  return scales->at(idx);
+}
+
+} // namespace upsample
+
+using scale_t = std::optional<double>;
+using upsampling_nearest1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w);
+using _upsampling_nearest_exact1d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_w);
+using upsampling_nearest2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w);
+using _upsampling_nearest_exact2d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_h, scale_t scales_w);
+using upsampling_nearest3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w);
+using _upsampling_nearest_exact3d = void(*)(const Tensor& output, const Tensor& input, scale_t scales_d, scale_t scales_h, scale_t scales_w);
+using upsampling_linear1d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_w);
+using upsampling_bilinear2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
+using _upsampling_bilinear2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
+using upsampling_trilinear3d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_d, scale_t scales_h, scale_t scales_w);
+using upsampling_bicubic2d = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
+using _upsampling_bicubic2d_aa = void(*)(const Tensor& output, const Tensor& input, bool align_corners, scale_t scales_h, scale_t scales_w);
+DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_kernel);
+DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_kernel);
+DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_kernel);
+DECLARE_DISPATCH(upsampling_nearest1d, upsample_nearest1d_backward_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact1d, _upsample_nearest_exact1d_backward_kernel);
+DECLARE_DISPATCH(upsampling_nearest2d, upsample_nearest2d_backward_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact2d, _upsample_nearest_exact2d_backward_kernel);
+DECLARE_DISPATCH(upsampling_nearest3d, upsample_nearest3d_backward_kernel);
+DECLARE_DISPATCH(_upsampling_nearest_exact3d, _upsample_nearest_exact3d_backward_kernel);
+DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_kernel);
+DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_kernel);
+DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_kernel);
+DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_kernel);
+DECLARE_DISPATCH(upsampling_linear1d, upsample_linear1d_backward_kernel);
+DECLARE_DISPATCH(upsampling_bilinear2d, upsample_bilinear2d_backward_kernel);
+DECLARE_DISPATCH(_upsampling_bilinear2d_aa, _upsample_bilinear2d_aa_backward_kernel);
+DECLARE_DISPATCH(upsampling_trilinear3d, upsample_trilinear3d_backward_kernel);
+DECLARE_DISPATCH(upsampling_bicubic2d, upsample_bicubic2d_kernel);
+DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_kernel);
+DECLARE_DISPATCH(_upsampling_bicubic2d_aa, _upsample_bicubic2d_aa_backward_kernel);
+
+inline C10_UNUSED std::array<int64_t, 3> upsample_1d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
+  TORCH_CHECK(
+      output_size.size() == 1,
+      "It is expected output_size equals to 1, but got size ",
+      output_size.size());
+
+  TORCH_CHECK(
+      input_size.size() == 3,
+      "It is expected input_size equals to 3, but got size ",
+      input_size.size());
+
+  int64_t output_width = output_size[0];
+
+  int64_t nbatch = input_size[0];
+  int64_t channels = input_size[1];
+  int64_t input_width = input_size[2];
+
+  TORCH_CHECK(
+      input_width > 0 && output_width > 0,
+      "Input and output sizes should be greater than 0, but got input (W: ",
+      input_width,
+      ") and output (W: ",
+      output_width,
+      ")");
+
+  return {nbatch, channels, output_width};
+}
+
+inline C10_UNUSED std::array<int64_t, 4> upsample_2d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
+  TORCH_CHECK(
+      output_size.size() == 2,
+      "It is expected output_size equals to 2, but got size ",
+      output_size.size());
+
+  TORCH_CHECK(
+      input_size.size() == 4,
+      "It is expected input_size equals to 4, but got size ",
+      input_size.size());
+
+  int64_t output_height = output_size[0];
+  int64_t output_width = output_size[1];
+
+  int64_t nbatch = input_size[0];
+  int64_t channels = input_size[1];
+  int64_t input_height = input_size[2];
+  int64_t input_width = input_size[3];
+
+  TORCH_CHECK(
+      input_height > 0 && input_width > 0 && output_height > 0 &&
+          output_width > 0,
+      "Input and output sizes should be greater than 0,"
+      " but got input (H: ",
+      input_height,
+      ", W: ",
+      input_width,
+      ") output (H: ",
+      output_height,
+      ", W: ",
+      output_width,
+      ")");
+
+  return {nbatch, channels, output_height, output_width};
+}
+
+inline C10_UNUSED
+std::array<int64_t, 5> upsample_3d_common_check(IntArrayRef input_size, IntArrayRef output_size) {
+  TORCH_CHECK(
+      output_size.size() == 3,
+      "It is expected output_size equals to 3, but got size ",
+      output_size.size());
+
+  TORCH_CHECK(
+      input_size.size() == 5,
+      "It is expected input_size equals to 5, but got size ",
+      input_size.size());
+
+  int64_t output_depth = output_size[0];
+  int64_t output_height = output_size[1];
+  int64_t output_width = output_size[2];
+
+  int64_t nbatch = input_size[0];
+  int64_t channels = input_size[1];
+  int64_t input_depth = input_size[2];
+  int64_t input_height = input_size[3];
+  int64_t input_width = input_size[4];
+
+  TORCH_CHECK(
+      input_depth > 0 && input_height > 0 && input_width > 0 &&
+          output_depth > 0 && output_height > 0 && output_width > 0,
+      "Input and output sizes should be greater than 0, but got input (D: ",
+      input_depth,
+      ", H: ",
+      input_height,
+      ", W: ",
+      input_width,
+      ") output (D: ",
+      output_depth,
+      ", H: ",
+      output_height,
+      ", W: ",
+      output_width,
+      ")");
+
+
+  return {nbatch, channels, output_depth, output_height, output_width};
+}
+
+inline void upsample_2d_shape_check(
+    const Tensor& input,
+    const Tensor& grad_output,
+    int64_t nbatch,
+    int64_t nchannels,
+    int64_t input_height,
+    int64_t input_width,
+    int64_t output_height,
+    int64_t output_width) {
+  TORCH_CHECK(
+      input_height > 0 && input_width > 0 && output_height > 0 &&
+          output_width > 0,
+      "Input and output sizes should be greater than 0,"
+      " but got input (H: ",
+      input_height,
+      ", W: ",
+      input_width,
+      ") output (H: ",
+      output_height,
+      ", W: ",
+      output_width,
+      ")");
+
+  if (input.defined()) {
+    // Allow for empty batch size but not other dimensions
+    TORCH_CHECK(
+                (input.numel() != 0 ||
+                 (input.size(1) != 0 && input.size(2) != 0 && input.size(3) != 0)
+                 ) &&
+                input.dim() == 4,
+                "Non-empty 4D data tensor expected but got a tensor with sizes ",
+                input.sizes());
+  } else if (grad_output.defined()) {
+    check_dim_size(grad_output, 4, 0, nbatch);
+    check_dim_size(grad_output, 4, 1, nchannels);
+    check_dim_size(grad_output, 4, 2, output_height);
+    check_dim_size(grad_output, 4, 3, output_width);
+  }
+}
+
+template <typename scalar_t>
+inline scalar_t compute_scales_value(
+    const std::optional<double> scale,
+    int64_t input_size,
+    int64_t output_size) {
+      // see Note [compute_scales_value]
+      // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults.
+      return (scale.has_value() && scale.value() > 0.)
+          ? static_cast<scalar_t>(1.0 / scale.value())
+          : (static_cast<scalar_t>(input_size) / output_size);
+}
+
+template <typename scalar_t>
+inline scalar_t area_pixel_compute_scale(
+    int64_t input_size,
+    int64_t output_size,
+    bool align_corners,
+    const std::optional<double> scale) {
+  // see Note [area_pixel_compute_scale]
+  if(align_corners) {
+    if(output_size > 1) {
+      return static_cast<scalar_t>(input_size - 1) / (output_size - 1);
+    } else {
+      return static_cast<scalar_t>(0);
+    }
+  } else {
+    return compute_scales_value<scalar_t>(scale, input_size, output_size);
+  }
+}
+
+template <typename scalar_t>
+inline scalar_t area_pixel_compute_source_index(
+    scalar_t scale,
+    int64_t dst_index,
+    bool align_corners,
+    bool cubic) {
+  if (align_corners) {
+    return scale * dst_index;
+  } else {
+    scalar_t src_idx = scale * (dst_index + static_cast<scalar_t>(0.5)) -
+        static_cast<scalar_t>(0.5);
+    // [Note] Follow Opencv resize logic:
+    // We allow negative src_idx here and later will use
+    //   dx = src_idx - floorf(src_idx)
+    // to compute the "distance"(which affects weights).
+    // For linear modes, weight distribution doesn't matter
+    // for negative indices as they use 2 pixels to interpolate.
+    // For example, [-1, 0], they both use pixel 0 value so it
+    // doesn't affect if we bound the src_idx to 0 or not.
+    // TODO: Our current linear mode impls use unbound indices
+    // where we should and then remove this cubic flag.
+    // This matters in cubic mode, as we might need [-1, 0, 1, 2]
+    // to interpolate and the weights can be affected.
+    return (!cubic && src_idx < static_cast<scalar_t>(0)) ? scalar_t(0)
+                                                          : src_idx;
+  }
+}
+
+inline int64_t nearest_neighbor_compute_source_index(
+    const float scale,
+    int64_t dst_index,
+    int64_t input_size) {
+  // Index computation matching OpenCV INTER_NEAREST
+  // which is buggy and kept for BC
+  const int64_t src_index =
+      std::min(static_cast<int64_t>(floorf(dst_index * scale)), input_size - 1);
+  return src_index;
+}
+
+inline int64_t nearest_neighbor_exact_compute_source_index(
+    const float scale,
+    int64_t dst_index,
+    int64_t input_size) {
+  // index_f32 = (output_index + 0.5) * scale - 0.5
+  // input_index = round(index_f32)
+  // Same as Pillow and Scikit-Image/Scipy ndi.zoom
+  const int64_t src_index =
+      std::min(static_cast<int64_t>(floorf((dst_index + 0.5) * scale)), input_size - 1);
+  return src_index;
+}
+
+inline int64_t nearest_idx(
+    int64_t output_index,
+    int64_t input_size,
+    int64_t output_size,
+    std::optional<double> scales) {
+  // This method specificly treats cases: output_size == input_size or
+  // output_size == 2 * input_size, that we would like to get rid of
+  // We keep this method for BC and consider as deprecated.
+  // See nearest_exact_idx as replacement
+  if (output_size == input_size) {
+    // scale_factor = 1, simply copy
+    return output_index;
+  } else if (output_size == 2 * input_size) {
+    // scale_factor = 2, shift input index
+    return output_index >> 1;
+  } else {
+    float scale = compute_scales_value<float>(scales, input_size, output_size);
+    return nearest_neighbor_compute_source_index(scale, output_index, input_size);
+  }
+}
+
+inline int64_t nearest_exact_idx(
+    int64_t output_index,
+    int64_t input_size,
+    int64_t output_size,
+    std::optional<double> scales) {
+  float scale = compute_scales_value<float>(scales, input_size, output_size);
+    return nearest_neighbor_exact_compute_source_index(scale, output_index, input_size);
+}
+
+// Define a typedef to dispatch to nearest_idx or nearest_exact_idx
+typedef int64_t (*nearest_idx_fn_t)(int64_t, int64_t, int64_t, std::optional<double>);
+
+template <typename scalar_t>
+scalar_t upsample_get_value_bounded(
+    scalar_t* data,
+    int64_t width,
+    int64_t height,
+    int64_t x,
+    int64_t y) {
+  int64_t access_x = std::max(std::min(x, width - 1), static_cast<int64_t>(0));
+  int64_t access_y = std::max(std::min(y, height - 1), static_cast<int64_t>(0));
+  return data[access_y * width + access_x];
+}
+
+template <typename scalar_t>
+void upsample_increment_value_bounded(
+    scalar_t* data,
+    int64_t width,
+    int64_t height,
+    int64_t x,
+    int64_t y,
+    scalar_t value) {
+  int64_t access_x = std::max(std::min(x, width - 1), static_cast<int64_t>(0));
+  int64_t access_y = std::max(std::min(y, height - 1), static_cast<int64_t>(0));
+  data[access_y * width + access_x] += value;
+}
+
+// Based on
+// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+template <typename scalar_t>
+scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
+  return ((A + 2) * x - (A + 3)) * x * x + 1;
+}
+
+template <typename scalar_t>
+scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
+  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+}
+
+template <typename scalar_t>
+void get_cubic_upsample_coefficients(
+    scalar_t coeffs[4],
+    scalar_t t) {
+  scalar_t A = -0.75;
+
+  scalar_t x1 = t;
+  coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
+  coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
+
+  // opposite coefficients
+  scalar_t x2 = 1.0 - t;
+  coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
+  coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
+}
+
+template <typename scalar_t>
+inline scalar_t cubic_interp1d(
+    scalar_t x0,
+    scalar_t x1,
+    scalar_t x2,
+    scalar_t x3,
+    scalar_t t) {
+  scalar_t coeffs[4];
+  get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
+
+  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+}
+
+// when `real_input_index` becomes larger than the range the floating point
+// type can accurately represent, the type casting to `int64_t` might exceed
+// `input_size`, causing overflow. So we guard it with `std::min` below.
+template<typename scalar_t, typename opmath_t>
+inline void guard_index_and_lambda(const opmath_t& real_input_index, const int64_t& input_size, int64_t& input_index, scalar_t& lambda) {
+  input_index = std::min(static_cast<int64_t>(floorf(real_input_index)), input_size - 1);
+  lambda = std::min(
+      std::max(real_input_index - input_index, static_cast<opmath_t>(0)),
+      static_cast<opmath_t>(1)
+    );
+}
+
+template<typename scalar_t, typename opmath_t>
+inline void compute_source_index_and_lambda(
+    int64_t& input_index0,
+    int64_t& input_index1,
+    scalar_t& lambda0,
+    scalar_t& lambda1,
+    opmath_t ratio,
+    int64_t output_index,
+    int64_t input_size,
+    int64_t output_size,
+    bool align_corners) {
+  if (output_size == input_size) {
+    // scale_factor = 1, simply copy
+    input_index0 = output_index;
+    input_index1 = output_index;
+    lambda0 = static_cast<scalar_t>(1);
+    lambda1 = static_cast<scalar_t>(0);
+  } else {
+    const auto real_input_index =
+        area_pixel_compute_source_index<opmath_t>(
+            ratio, output_index, align_corners, /*cubic=*/false);
+    guard_index_and_lambda(real_input_index, input_size, input_index0, lambda1);
+    int64_t offset = (input_index0 < input_size - 1) ? 1 : 0;
+    input_index1 = input_index0 + offset;
+    lambda0 = static_cast<scalar_t>(1.) - lambda1;
+  }
+}
+
+// It will not be used by data types other than BFloat16 and Half.
+template <typename scalar_in, typename scalar_out,
+          typename std::enable_if_t<!is_reduced_floating_point_v<scalar_out> || !std::is_same<scalar_in, float>::value, int> = 0>
+void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) {
+  TORCH_CHECK((is_reduced_floating_point_v<scalar_out>),
+              "Upsample backward only support BFloat16 and Half in the lower precision data types on CPU.")
+  TORCH_CHECK((std::is_same<scalar_in, float>::value),
+              "Upsample backward should use float as acc buffer for BFloat16 and Half grad input on CPU.")
+  return;
+}
+
+template <typename scalar_in, typename scalar_out,
+          typename std::enable_if_t<is_reduced_floating_point_v<scalar_out> && std::is_same<scalar_in, float>::value, int> = 0>
+void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) {
+  using bVec = Vectorized<scalar_out>;
+  using fVec = Vectorized<float>;
+  int64_t d = 0;
+  for (; d < size - (size % bVec::size()); d += bVec::size()) {
+    bVec gin_bvec = bVec::loadu(gin + d);
+    auto [gin_fvec0, gin_fvec1] = convert_to_float<scalar_out>(gin_bvec);
+    gin_fvec0 += fVec::loadu(buffer_ptr + d);
+    gin_fvec1 += fVec::loadu(buffer_ptr + d + fVec::size());
+    fVec(0).store(buffer_ptr + d);
+    fVec(0).store(buffer_ptr + d + fVec::size());
+    convert_from_float<scalar_out>(gin_fvec0, gin_fvec1).store(gin + d);
+  }
+  for (; d < size; d++) {
+    gin[d] += buffer_ptr[d];
+    buffer_ptr[d] = 0;
+  }
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/batch_norm.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/batch_norm.h
new file mode 100644
index 0000000000000000000000000000000000000000..eba4b0a963241d8ed97eda561d1b3838b961716f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/batch_norm.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at::native {
+
+using batch_norm_fn = void (*)(Tensor&, const Tensor&, const Tensor&,
+    const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, bool, double);
+using batch_norm_collect_stats_fn = void (*)(Tensor&, Tensor&, const Tensor&);
+using batch_norm_backward_fn = void(*)(Tensor&, Tensor&, Tensor&, const Tensor&,
+        const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, bool, double);
+
+DECLARE_DISPATCH(batch_norm_fn, batch_norm_cpu_stub);
+DECLARE_DISPATCH(batch_norm_collect_stats_fn, batch_norm_cpu_collect_stats_stub);
+DECLARE_DISPATCH(batch_norm_backward_fn, batch_norm_cpu_backward_stub);
+
+// TensorAccessor when it is defined to work around undefined...
+template <typename scalar_t>
+static TensorAccessor<scalar_t, 1> conditional_accessor_1d(const Tensor& t) {
+  if (! t.defined()) {
+    return TensorAccessor<scalar_t, 1>(nullptr, nullptr, nullptr);
+  }
+  return t.accessor<scalar_t, 1>();
+}
+
+template <typename scalar_t>
+static scalar_t* conditional_data_ptr(const Tensor& t) {
+  if constexpr (std::is_const_v<scalar_t>) {
+    return t.defined() ? t.contiguous().const_data_ptr<scalar_t>()
+                      : nullptr;
+  } else {
+    return t.defined() ? t.contiguous().data_ptr<scalar_t>()
+                      : nullptr;
+  }
+}
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h
new file mode 100644
index 0000000000000000000000000000000000000000..6bcd57027d51725aa3f315b9c35b9db126fc76c9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h
@@ -0,0 +1,494 @@
+#include <ATen/Config.h>
+#include <ATen/core/DimVector.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/native/cuda/CuFFTUtils.h>
+#include <ATen/native/utils/ParamsHash.h>
+#include <c10/util/accumulate.h>
+#include <c10/util/irange.h>
+
+#include <cufft.h>
+#include <cufftXt.h>
+
+#include <limits>
+#include <list>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+
+namespace at { namespace native { namespace detail {
+
+// Enum representing the FFT type
+enum class CuFFTTransformType : int8_t {
+  C2C,  // Complex-to-complex
+  R2C,  // Real-to-complex
+  C2R,  // Complex-to-real
+};
+
+// This struct is used to let us easily compute hashes of the
+// parameters.
+// It will be the **key** to the plan cache.
+struct CuFFTParams
+{
+  int64_t signal_ndim_; // between 1 and max_rank, i.e., 1 <= signal_ndim <= 3
+  // These include additional batch dimension as well.
+  int64_t sizes_[max_rank + 1];
+  int64_t input_strides_[max_rank + 1];
+  int64_t output_strides_[max_rank + 1];
+  CuFFTTransformType fft_type_;
+  ScalarType value_type_;
+
+  CuFFTParams() = default;
+
+  CuFFTParams(IntArrayRef in_strides, IntArrayRef out_strides,
+      IntArrayRef signal_sizes, CuFFTTransformType fft_type, ScalarType value_type) {
+    // Padding bits must be zeroed for hashing
+    memset(this, 0, sizeof(*this));
+    signal_ndim_ = signal_sizes.size() - 1;
+    fft_type_ = fft_type;
+    value_type_ = value_type;
+
+    TORCH_INTERNAL_ASSERT(in_strides.size() == signal_sizes.size());
+    TORCH_INTERNAL_ASSERT(out_strides.size() == signal_sizes.size());
+    TORCH_INTERNAL_ASSERT(1 <= signal_ndim_ && signal_ndim_ <= max_rank);
+
+    std::copy(signal_sizes.cbegin(), signal_sizes.cend(), sizes_);
+    std::copy(in_strides.cbegin(), in_strides.cend(), input_strides_);
+    std::copy(out_strides.cbegin(), out_strides.cend(), output_strides_);
+  }
+};
+
+static_assert(std::is_trivial<CuFFTParams>::value, "");
+
+// Returns true if the transform type has complex input
+inline bool cufft_complex_input(CuFFTTransformType type) {
+  switch (type) {
+    case CuFFTTransformType::C2C:
+    case CuFFTTransformType::C2R:
+      return true;
+
+    case CuFFTTransformType::R2C:
+      return false;
+  }
+  TORCH_INTERNAL_ASSERT(false);
+}
+
+// Returns true if the transform type has complex output
+inline bool cufft_complex_output(CuFFTTransformType type) {
+  switch (type) {
+    case CuFFTTransformType::C2C:
+    case CuFFTTransformType::R2C:
+      return true;
+
+    case CuFFTTransformType::C2R:
+      return false;
+  }
+  TORCH_INTERNAL_ASSERT(false);
+}
+
+// Create transform type enum from bools representing if input and output are complex
+inline CuFFTTransformType GetCuFFTTransformType(bool complex_input, bool complex_output) {
+  if (complex_input && complex_output) {
+    return CuFFTTransformType::C2C;
+  } else if (complex_input && !complex_output) {
+    return CuFFTTransformType::C2R;
+  } else if (!complex_input && complex_output) {
+    return CuFFTTransformType::R2C;
+  }
+  TORCH_INTERNAL_ASSERT(false, "Real to real FFTs are not supported");
+}
+
+
+class CuFFTHandle {
+  ::cufftHandle handle_;
+public:
+
+  CuFFTHandle() {
+    CUFFT_CHECK(cufftCreate(&handle_));
+  }
+
+  ::cufftHandle & get() { return handle_; }
+  const ::cufftHandle & get() const { return handle_; }
+
+  ~CuFFTHandle() {
+// Not using fftDestroy() for rocFFT to work around double freeing of handles
+#if !defined(USE_ROCM)
+    cufftDestroy(handle_);
+#endif
+  }
+};
+
+__forceinline__
+static bool is_pow_of_two(int64_t x) {
+  return (x & (x - 1)) == 0;
+}
+
+using cufft_size_type = long long int;
+
+using CuFFTDimVector = c10::SmallVector<cufft_size_type, at::kDimVectorStaticSize>;
+
+// Struct representing a tensor in CuFFT's data layout for planning transforms
+// See NOTE [ cuFFT Embedded Strides ].
+struct CuFFTDataLayout {
+  CuFFTDimVector embed;
+  cufft_size_type stride, dist;
+  bool must_clone, simple;
+};
+
+// Returns a cufft embedding for a contiguous signal of the given size.
+// e.g. if the input is cloned, this will be the resulting data layout
+// See NOTE [ cuFFT Embedded Strides ].
+inline CuFFTDataLayout cufft_simple_embed(IntArrayRef sizes, bool onesided) {
+  CuFFTDataLayout layout;
+  layout.simple = true;
+  layout.must_clone = false;
+  layout.embed.assign(sizes.cbegin() + 1, sizes.cend());
+  if (onesided) {
+    layout.embed.back() = sizes.back() / 2 + 1;
+  }
+  layout.stride = 1;
+  layout.dist = 1;
+  for (const auto& len : layout.embed) {
+    layout.dist *= len;
+  }
+  return layout;
+}
+
+// Convert strides to a CuFFT embedded representation.
+// If strides cannot be embedded, returns a simple layout and sets must_clone flag
+// See NOTE [ cuFFT Embedded Strides ].
+inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bool onesided) {
+  const auto signal_ndim = strides.size() - 1;
+  CuFFTDataLayout layout;
+  auto last_stride = strides[signal_ndim];
+  layout.must_clone = (last_stride <= 0);
+
+  const auto last_dim_size = onesided ?
+      sizes[signal_ndim] / 2 + 1 : sizes[signal_ndim];
+  const auto signal_numel = c10::multiply_integers(sizes.slice(1, sizes.size() - 2)) * last_dim_size;
+
+  // Zero stides are not allowed, even if the batch size is one.
+  // If that happens just set a dummy case
+  if (sizes[0] == 1) {
+    layout.dist = signal_numel;
+  } else if (strides[0] == 0) {
+    layout.must_clone = true;
+  } else {
+    layout.dist = strides[0];
+  }
+
+  // Calculate the embedding shape, or set must_clone if the strides cannot be embedded
+  layout.embed.resize(signal_ndim);
+  for (auto i = signal_ndim - 1; !layout.must_clone && i > 0; i--) {
+    auto stride = strides[i];
+    if (sizes[i] == 1) {
+      layout.embed[i] = 1;
+    } else if (stride > 0 && stride % last_stride == 0) {
+      layout.embed[i] = stride / last_stride;
+      last_stride = stride;
+    } else {
+      layout.must_clone = true;
+    }
+  }
+
+  if (layout.must_clone) {
+    // If the input needs to be cloned, assume it will be contiguous
+    layout = cufft_simple_embed(sizes, onesided);
+    layout.must_clone = true;
+  } else {
+    layout.embed[0] = sizes[1];
+    layout.stride = strides[signal_ndim];
+    // Determine if layout represents a simple embedding (contiguous data)
+    layout.simple = [&] {
+      for (const auto i : c10::irange(1, signal_ndim - 1)) {
+        if (layout.embed[i] != sizes[i + 1]) {
+          return false;
+        }
+      }
+
+      return (layout.stride == 1 && layout.dist == signal_numel &&
+          layout.embed.back() == last_dim_size);
+    }();
+  }
+  return layout;
+}
+
+// This class contains all the information needed to execute a cuFFT plan:
+//   1. the plan
+//   2. whether to clone input before executing the plan
+//   3. the workspace size needed
+//
+// This class will be the **value** in the plan cache.
+// It **owns** the raw plan via a unique_ptr.
+class CuFFTConfig {
+public:
+
+  // Only move semantics is enought for this class. Although we already use
+  // unique_ptr for the plan, still remove copy constructor and assignment op so
+  // we don't accidentally copy and take perf hit.
+  CuFFTConfig(const CuFFTConfig&) = delete;
+  CuFFTConfig& operator=(CuFFTConfig const&) = delete;
+
+  explicit CuFFTConfig(const CuFFTParams& params):
+      CuFFTConfig(
+          IntArrayRef(params.input_strides_, params.signal_ndim_ + 1),
+          IntArrayRef(params.output_strides_, params.signal_ndim_ + 1),
+          IntArrayRef(params.sizes_, params.signal_ndim_ + 1),
+          params.fft_type_,
+          params.value_type_) {}
+
+  // For complex types, strides are in units of 2 * element_size(dtype)
+  // sizes are for the full signal, including batch size and always two-sided
+  CuFFTConfig(IntArrayRef in_strides, IntArrayRef out_strides,
+      IntArrayRef sizes, CuFFTTransformType fft_type, ScalarType dtype):
+        fft_type_(fft_type), value_type_(dtype) {
+
+    // signal sizes (excluding batch dim)
+    CuFFTDimVector signal_sizes(sizes.begin() + 1, sizes.end());
+
+    // input batch size
+    const int64_t batch = sizes[0];
+    const int64_t signal_ndim = sizes.size() - 1;
+
+    // Since cuFFT has limited non-unit stride support and various constraints, we
+    // use a flag to keep track throughout this function to see if we need to
+    // input = input.clone();
+
+#if defined(USE_ROCM)
+    // clone input to avoid issues with hipfft clobering the input and failing tests
+    clone_input = true;
+#else
+    clone_input = false;
+#endif
+
+    // For half, base strides on the real part of real-to-complex and
+    // complex-to-real transforms are not supported. Since our output is always
+    // contiguous, only need to check real-to-complex case.
+    if (dtype == ScalarType::Half) {
+      // cuFFT on half requires compute capability of at least SM_53
+      auto dev_prop = at::cuda::getCurrentDeviceProperties();
+      TORCH_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3),
+               "cuFFT doesn't support signals of half type with compute "
+               "capability less than SM_53, but the device containing input half "
+               "tensor only has SM_", dev_prop->major, dev_prop->minor);
+      for (const auto i : c10::irange(signal_ndim)) {
+        TORCH_CHECK(is_pow_of_two(sizes[i + 1]),
+            "cuFFT only supports dimensions whose sizes are powers of two when"
+            " computing in half precision, but got a signal size of",
+            sizes.slice(1));
+      }
+      clone_input |= in_strides.back() != 1;
+    }
+
+    CuFFTDataLayout in_layout;
+    if (clone_input) {
+      in_layout = cufft_simple_embed(sizes, fft_type == CuFFTTransformType::C2R);
+    } else {
+      in_layout = as_cufft_embed(in_strides, sizes, fft_type == CuFFTTransformType::C2R);
+    }
+    auto out_layout = as_cufft_embed(out_strides, sizes, fft_type == CuFFTTransformType::R2C);
+    TORCH_INTERNAL_ASSERT(!out_layout.must_clone, "Out strides cannot be represented as CuFFT embedding");
+    clone_input |= in_layout.must_clone;
+
+    // Check if we can take advantage of simple data layout.
+    //
+    // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu.
+
+    const bool simple_layout = in_layout.simple && out_layout.simple;
+    cudaDataType itype, otype, exec_type;
+    const auto complex_input = cufft_complex_input(fft_type);
+    const auto complex_output = cufft_complex_output(fft_type);
+    if (dtype == ScalarType::Float) {
+      itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
+      otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
+      exec_type = CUDA_C_32F;
+    } else if (dtype == ScalarType::Double) {
+      itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
+      otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
+      exec_type = CUDA_C_64F;
+    } else if (dtype == ScalarType::Half) {
+      itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
+      otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
+      exec_type = CUDA_C_16F;
+    } else {
+      TORCH_CHECK(false, "cuFFT doesn't support tensor of type: ", dtype);
+    }
+
+    // disable auto allocation of workspace to use THC allocator
+    CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0));
+
+    size_t ws_size_t;
+
+    // make plan
+    if (simple_layout) {
+      // If with unit-stride, we tell cuFFT by setting inembed == onembed == NULL.
+      // In such case, cuFFT ignores istride, ostride, idist, and odist
+      // by assuming istride = ostride = 1.
+      //
+      // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu.
+      CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
+        /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
+        /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
+        batch, &ws_size_t, exec_type));
+    } else {
+      CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(),
+            in_layout.embed.data(), in_layout.stride, in_layout.dist, itype,
+            out_layout.embed.data(), out_layout.stride, out_layout.dist, otype,
+            batch, &ws_size_t, exec_type));
+    }
+    ws_size = static_cast<int64_t>(ws_size_t);
+  }
+
+  const cufftHandle &plan() const { return plan_ptr.get(); }
+
+  CuFFTTransformType transform_type() const { return fft_type_; }
+  ScalarType data_type() const { return value_type_; }
+  bool should_clone_input() const { return clone_input; }
+  int64_t workspace_size() const { return ws_size; }
+
+private:
+  CuFFTHandle plan_ptr;
+  bool clone_input;
+  int64_t ws_size;
+  CuFFTTransformType fft_type_;
+  ScalarType value_type_;
+};
+
+#if defined(USE_ROCM)
+  // Note that the max plan number for CUDA version < 10 has to be 1023
+  // due to a bug that fails on the 1024th plan
+  constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023;
+  constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
+#else
+  constexpr int64_t CUFFT_MAX_PLAN_NUM = std::numeric_limits<int64_t>::max();
+  // The default max cache size chosen for CUDA version > 10 is arbitrary.
+  // This number puts a limit on how big of a plan cache should we maintain by
+  // default. Users can always configure it via cufft_set_plan_cache_max_size.
+  constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
+#endif
+static_assert(0 <= CUFFT_MAX_PLAN_NUM && CUFFT_MAX_PLAN_NUM <= std::numeric_limits<int64_t>::max(),
+              "CUFFT_MAX_PLAN_NUM not in size_t range");
+static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
+              "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
+
+// This cache assumes that the mapping from key to value never changes.
+// This is **NOT** thread-safe. Please use a mutex when using it **AND** the
+// value returned from try_emplace_value.
+// The contract of using this cache is that try_emplace_value should only be
+// used when the max_size is positive.
+class CuFFTParamsLRUCache {
+public:
+  using kv_t = typename std::pair<CuFFTParams, CuFFTConfig>;
+  using map_t = typename std::unordered_map<std::reference_wrapper<CuFFTParams>,
+                                            typename std::list<kv_t>::iterator,
+                                            ParamsHash<CuFFTParams>,
+                                            ParamsEqual<CuFFTParams>>;
+  using map_kkv_iter_t = typename map_t::iterator;
+
+
+  CuFFTParamsLRUCache() : CuFFTParamsLRUCache(CUFFT_DEFAULT_CACHE_SIZE) {}
+
+  CuFFTParamsLRUCache(int64_t max_size) {
+    _set_max_size(max_size);
+  }
+
+  CuFFTParamsLRUCache(CuFFTParamsLRUCache&& other) noexcept :
+    _usage_list(std::move(other._usage_list)),
+    _cache_map(std::move(other._cache_map)),
+    _max_size(other._max_size) {}
+
+  CuFFTParamsLRUCache& operator=(CuFFTParamsLRUCache&& other) noexcept {
+    _usage_list = std::move(other._usage_list);
+    _cache_map = std::move(other._cache_map);
+    _max_size = other._max_size;
+    return *this;
+  }
+
+  // If key is in this cache, return the cached config. Otherwise, emplace the
+  // config in this cache and return it.
+  // Return const reference because CuFFTConfig shouldn't be tampered with once
+  // created.
+  const CuFFTConfig &lookup(CuFFTParams params) {
+    AT_ASSERT(_max_size > 0);
+
+    map_kkv_iter_t map_it = _cache_map.find(params);
+    // Hit, put to list front
+    if (map_it != _cache_map.end()) {
+      _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
+      return map_it->second->second;
+    }
+
+    // Miss
+    // remove if needed
+    if (_usage_list.size() >= _max_size) {
+      auto last = _usage_list.end();
+      last--;
+      _cache_map.erase(last->first);
+      _usage_list.pop_back();
+    }
+
+    // construct new plan at list front, then insert into _cache_map
+    _usage_list.emplace_front(std::piecewise_construct,
+                       std::forward_as_tuple(params),
+                       std::forward_as_tuple(params));
+    auto kv_it = _usage_list.begin();
+    _cache_map.emplace(std::piecewise_construct,
+                std::forward_as_tuple(kv_it->first),
+                std::forward_as_tuple(kv_it));
+    return kv_it->second;
+  }
+
+  void clear() {
+    _cache_map.clear();
+    _usage_list.clear();
+  }
+
+  void resize(int64_t new_size) {
+    _set_max_size(new_size);
+    auto cur_size = _usage_list.size();
+    if (cur_size > _max_size) {
+      auto delete_it = _usage_list.end();
+      for (size_t i = 0; i < cur_size - _max_size; i++) {
+        delete_it--;
+        _cache_map.erase(delete_it->first);
+      }
+      _usage_list.erase(delete_it, _usage_list.end());
+    }
+  }
+
+  size_t size() const { return _cache_map.size(); }
+
+  size_t max_size() const noexcept { return _max_size; }
+
+  std::mutex mutex;
+
+private:
+  // Only sets size and does value check. Does not resize the data structures.
+  void _set_max_size(int64_t new_size) {
+    // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since
+    // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check
+    // first.
+    TORCH_CHECK(new_size >= 0,
+             "cuFFT plan cache size must be non-negative, but got ", new_size);
+    TORCH_CHECK(new_size <= CUFFT_MAX_PLAN_NUM,
+             "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size);
+    _max_size = static_cast<size_t>(new_size);
+  }
+
+  std::list<kv_t> _usage_list;
+  map_t _cache_map;
+  size_t _max_size;
+};
+
+// Since ATen is separated into CPU build and CUDA build, we need a way to call
+// these functions only when CUDA is loaded. We use CUDA hooks for this purpose
+// (at cuda/detail/CUDAHooks.cpp), and call the hooked functions from the actual
+// native function counterparts (at native/SpectralOps.cpp), i.e.,
+// _cufft_get_plan_cache_max_size, _cufft_set_plan_cache_max_size
+// _cufft_get_plan_cache_size, and _cufft_clear_plan_cache.
+int64_t cufft_get_plan_cache_max_size_impl(DeviceIndex device_index);
+void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_size);
+int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index);
+void cufft_clear_plan_cache_impl(DeviceIndex device_index);
+
+}}} // namespace at::native::detail
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0d8d45c1defb90af4da7d2c39d914d3d88ddafc3
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh
@@ -0,0 +1,22 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/cuda/Atomic.cuh>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/TensorUtils.h>
+
+namespace at {
+namespace native {
+
+Tensor embedding_backward_cuda_kernel(
+    const Tensor &grad,
+    const Tensor &orig_indices,
+    const Tensor &sorted_indices,
+    const Tensor &count,
+    int64_t num_weights,
+    int padding_idx = -1,
+    bool mode_mean = false,
+    const Tensor &offset2bag = Tensor(),
+    const Tensor &bag_size = Tensor(),
+    const Tensor &per_sample_weights = Tensor());
+
+}}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9b08911b1d9500f200d55e031ffff7658d5491f0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ForeachMinMaxFunctors.cuh
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <ATen/NumericUtils.h>
+
+namespace at::native {
+
+// std:: does not have clamp functors
+template <typename T>
+struct minimum {
+  __device__ T operator()(const T& a, const T& b) const {
+    return (_isnan(a) || a < b) ? a : b;
+  }
+};
+
+template <typename T>
+struct maximum {
+  __device__ T operator()(const T& a, const T& b) const {
+    return (_isnan(a) || a > b) ? a : b;
+  }
+};
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h
new file mode 100644
index 0000000000000000000000000000000000000000..aace9c30b0a7e9d08de71c4baf1490d45ff6d36e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h
@@ -0,0 +1,32 @@
+#pragma once
+#include <array>
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at {
+namespace native {
+
+void launch_grid_sampler_2d_forward_kernel(
+    const TensorBase &output, const TensorBase &input, const TensorBase &grid,
+    int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+
+void launch_grid_sampler_3d_forward_kernel(
+    const TensorBase &output, const TensorBase &input, const TensorBase &grid,
+    int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+
+void launch_grid_sampler_2d_backward_kernel(
+    const TensorBase &grad_input, const TensorBase &grad_grid,
+    const TensorBase &grad_output, const TensorBase &input,
+    const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
+    bool align_corners, std::array<bool, 2> output_mask);
+
+void launch_grid_sampler_3d_backward_kernel(
+    const TensorBase &grad_input, const TensorBase &grad_grid,
+    const TensorBase &grad_output, const TensorBase &input,
+    const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode,
+    bool align_corners, std::array<bool, 2> output_mask);
+
+}}  // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6f350c550ce93667bdf21b8b0c7d9798fdf5f15f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh
@@ -0,0 +1,187 @@
+#pragma once
+
+#include <ATen/jit_macros.h>
+
+#if AT_USE_JITERATOR()
+
+#include <ATen/cuda/CUDAConfig.h>
+
+#include <ATen/OpMathType.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/native/TensorIteratorDynamicCasting.h>
+
+#include <ATen/native/cuda/MemoryAccess.cuh>
+
+#include <ATen/native/cuda/CUDAJitLoops.cuh>
+
+namespace at {
+namespace native {
+
+/* Note [Jiterator]
+The "jiterator" simply just-in-time compiles the same kernels that
+Loops.cuh (and CUDALoops.cuh) usually build. This reduces build time,
+build size, and initial CUDA context size.
+
+By default on non-Windows systems, it also caches compiled kernels in ~/.cache/torch/kernels.
+This behavior is controlled with two environment variables:
+  - USE_PYTORCH_KERNEL_CACHE, if set to zero then this will disable all cache use
+  - PYTORCH_KERNEL_CACHE_PATH, if set specifies the folder to use for cached kernels
+
+The jiterator currently has some limitations, however. It cannot:
+  - handle math on complex datatypes
+  - handle kernels with scalar parameters
+
+These improvements will likely come soon.
+
+For examples of how to use the jiterator see the i1 and gcd kernel
+implementations, which pass jittable strings implementing their
+operations instead of the typical CUDA functors.
+
+To pass a runtime argument (similar to lambda captures in non-JIT kernels),
+we need to pass to additional arguments to `jitted_gpu_kernel` by value.
+Currently only primitive C++ types used for computation are valid.
+The order of these extra arguments should be same as the order they appear
+in kernel's function signature. (look at polygamma for example)
+
+NOTE: One big restriction being that these arguments should be after the
+arguments provided by TensorIterator. Eg. While capturing `n`, where
+`scalar_t x` and `scalar_t y` are provided by TensorIterator,
+* foo(scalar_t x, scalar_t y, int n) works!
+* foo(int n, scalar_t x, scalar_y) doesn't work
+* foo(scalar_t x, int n, scalar_y) doesn't work
+
+*/
+
+// Entrypoint for jitted GPU kernels.
+// Only handles elementwise unary and binary kernels with a
+//   common dtype and a single output.
+// NOTE: this assumes the op's iterator has a common_dtype.
+// NOTE: We use std::tuple instead of parameter pack
+//  for `extra_args` due to following
+// bug on older versions of clang
+// https://bugs.llvm.org/show_bug.cgi?id=23029
+template <
+    char const* name,
+    typename return_type,
+    typename f_inputs_type,
+    int arity,
+    typename... Args>
+void jitted_gpu_kernel(
+    TensorIteratorBase& iter,
+    const std::string& f,
+    at::cuda::jit::BinaryFuncVariant scalar_pos =
+        at::cuda::jit::BinaryFuncVariant::NoScalar,
+    at::opmath_type<f_inputs_type> scalar_val = 0,
+    std::tuple<Args...> extra_args = std::make_tuple()) {
+  // TODO: much of preamble is common to both jitted_gpu_kernel and gpu_kernel
+  //   Maybe it could be refactored?
+  for (int arg = 0; arg < iter.ntensors(); arg++) {
+    TORCH_INTERNAL_ASSERT(
+      iter.device(arg).is_cuda(),
+      "argument ", arg, ": expected a CUDA device but found ", iter.device(arg));
+  }
+
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      jitted_gpu_kernel<name, return_type, f_inputs_type, arity>(
+          sub_iter, f, scalar_pos, scalar_val, extra_args);
+    }
+
+    return;
+  }
+
+  // Computes if dynamic casting is needed
+  // Dynamic casting is needed if an input's dtype differs from the common dtype
+  //   or if the result dtype differs from the output's dtype
+  // Note: this is intentionally divergent from calling needs_dynamic_casting,
+  //   which is more general and inspects a lambda to determine if dynamic
+  //   casting is needed.
+  bool needs_dynamic_casting = false;
+
+  // Checks output
+  const ScalarType return_scalar_type = c10::CppTypeToScalarType<return_type>::value;
+  const auto dtype0 = iter.dtype(0);
+  if (dtype0 != return_scalar_type) {
+    needs_dynamic_casting = true;
+  }
+
+  // Checks input(s)
+  const ScalarType inputs_scalar_type = c10::CppTypeToScalarType<f_inputs_type>::value;
+  for (auto i = decltype(arity){1}; i < (arity + 1); ++i) {
+    const auto dtypei = iter.dtype(i);
+    if (dtypei != inputs_scalar_type) {
+      needs_dynamic_casting = true;
+      break;
+    }
+  }
+  if (scalar_pos == at::cuda::jit::BinaryFuncVariant::NoScalar) {
+    // NOTE: With `scalar_pos=NoScalar`,`scalar_val` is not used
+    // for computation in the generated code and hence we pass a dummy
+    // value of `0`.
+    jitted_gpu_kernel_impl<
+        /*name*/ name,
+        /*return_type=*/return_type,
+        /*f_inputs_type=*/f_inputs_type,
+        arity,
+        at::cuda::jit::BinaryFuncVariant::NoScalar>(
+        iter, f, needs_dynamic_casting, /*scalar_val=*/scalar_val, extra_args);
+  } else if (scalar_pos == at::cuda::jit::BinaryFuncVariant::RhsScalar) {
+    jitted_gpu_kernel_impl<
+        /*name*/ name,
+        /*return_type=*/return_type,
+        /*f_inputs_type=*/f_inputs_type,
+        arity,
+        at::cuda::jit::BinaryFuncVariant::RhsScalar>(
+        iter,
+        f,
+        needs_dynamic_casting,
+        scalar_val,
+        extra_args);
+
+  } else {
+    jitted_gpu_kernel_impl<
+        /*name*/ name,
+        /*return_type=*/return_type,
+        /*f_inputs_type=*/f_inputs_type,
+        arity,
+        at::cuda::jit::BinaryFuncVariant::LhsScalar>(
+        iter,
+        f,
+        needs_dynamic_casting,
+        scalar_val,
+        extra_args);
+  }
+}
+
+// TODO: support runtime state capture similar to `jitted_gpu_kernel`.
+template <char const *name, typename return_type, typename f_inputs_type>
+void opmath_jitted_gpu_kernel_with_scalars(TensorIteratorBase& iter, const std::string& f) {
+  TORCH_INTERNAL_ASSERT(iter.ntensors() == 3);
+  //currently jiterator only handles binary functions where both inputs are of the same type (f_inputs_type)
+  using opmath_t = at::opmath_type<f_inputs_type>;
+  if (iter.is_cpu_scalar(1)) {
+    auto scalar_val = iter.scalar_value<opmath_t>(1);
+    iter.remove_operand(1);
+    // TODO: When all kernels that use gpu_kernel_with_scalars are
+    // ported to structured, this device guard can be deleted.  This
+    // works around incorrect device guard generation for pre-structured
+    // kernels device guards, but structured kernels do it right and
+    // we can assume the device is already set correctly
+    const OptionalDeviceGuard device_guard(iter.device(1));
+    jitted_gpu_kernel<name, return_type, f_inputs_type, 1>(iter, f, at::cuda::jit::BinaryFuncVariant::LhsScalar, scalar_val);
+  } else if (iter.is_cpu_scalar(2)) {
+    auto scalar_val = iter.scalar_value<opmath_t>(2);
+    iter.remove_operand(2);
+    jitted_gpu_kernel<name, return_type, f_inputs_type, 1>(iter, f, at::cuda::jit::BinaryFuncVariant::RhsScalar, scalar_val);
+  } else {
+    jitted_gpu_kernel<name, return_type, f_inputs_type, 2>(iter, f);
+  }
+}
+
+}}  // at::native
+
+#endif // AT_USE_JITERATOR()
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Loops.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Loops.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..cb14f275e21718db44bc5f175ce6e650426966ff
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Loops.cuh
@@ -0,0 +1,326 @@
+#pragma once
+
+#include <ATen/detail/FunctionTraits.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/TensorIteratorDynamicCasting.h>
+#include <ATen/cuda/detail/OffsetCalculator.cuh>
+#include <ATen/OpMathType.h>
+#include <ATen/native/cuda/thread_constants.h>
+
+#include <thrust/tuple.h>
+
+#include <ATen/native/cuda/MemoryAccess.cuh>
+
+
+namespace at { namespace native {
+
+template<int N>
+static OffsetCalculator<N> make_input_offset_calculator(const TensorIteratorBase& iter) {
+  // array size can not be 0, this happens when N == 0
+  constexpr int array_size = std::max<int>(N, 1);
+  TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs());
+  std::array<const int64_t*, array_size> strides;
+  int64_t element_sizes[array_size];
+  for (int i = 0; i < N; i++) {
+    strides[i] = iter.strides(i + iter.noutputs()).data();
+    element_sizes[i] = iter.element_size(i + iter.noutputs());
+  }
+  return OffsetCalculator<N>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
+}
+
+template <int num_outputs = 1>
+static OffsetCalculator<num_outputs> make_output_offset_calculator(const TensorIteratorBase& iter) {
+  TORCH_INTERNAL_ASSERT(num_outputs == iter.noutputs());
+  std::array<const int64_t*, num_outputs> strides;
+  int64_t element_sizes[num_outputs];
+  for (int i = 0; i < num_outputs; i++) {
+    strides[i] = iter.strides(i).data();
+    element_sizes[i] = iter.element_size(i);
+  }
+  return OffsetCalculator<num_outputs>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
+}
+
+template<typename func_t, typename policy_t>
+__device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) {
+  using traits = function_traits<func_t>;
+  using return_t = typename traits::result_type;
+  using args_t = typename traits::ArgsTuple;
+
+  int idx = blockIdx.x;
+
+  return_t results[thread_work_size()];
+  args_t args[thread_work_size()];
+
+  // load
+  policy.load(args, idx);
+
+  // compute
+  #pragma unroll
+  for (int i = 0; i < thread_work_size(); i++) {
+    if (policy.check_inbounds(i)) {
+      results[i] = c10::guts::apply(f, args[i]);
+    }
+  }
+
+  // store
+  policy.store(results, idx);
+}
+
+}}  // namespace at::native
+
+#include <ATen/native/cuda/CUDALoops.cuh>
+
+namespace at:: native {
+
+template <typename func_t>
+void gpu_kernel_nocast(TensorIteratorBase& iter, const func_t& f) {
+
+  for (int arg = 0; arg < iter.ntensors(); arg++) {
+    TORCH_INTERNAL_ASSERT(
+      iter.device(arg).is_cuda(),
+      "argument ", arg, ": expected a CUDA device but found ", iter.device(arg));
+  }
+
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      gpu_kernel_nocast(sub_iter, f);
+    }
+    return;
+  }
+
+  gpu_kernel_impl_nocast(iter, f);
+}
+
+template <typename func_t>
+void gpu_kernel(TensorIteratorBase& iter, const func_t& f) {
+
+  for (int arg = 0; arg < iter.ntensors(); arg++) {
+    TORCH_INTERNAL_ASSERT(
+      iter.device(arg).is_cuda(),
+      "argument ", arg, ": expected a CUDA device but found ", iter.device(arg));
+  }
+
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      gpu_kernel(sub_iter, f);
+    }
+    return;
+  }
+
+  gpu_kernel_impl(iter, f);
+}
+
+template<typename arg1_t, typename arg2_t, typename return_t, typename func_t>
+struct AUnaryFunctor {
+  using traits = function_traits<func_t>;
+  using opmath_arg1_t = typename traits::template arg<0>::type;
+  __device__ return_t operator()(arg2_t b) const {
+    return f(a, b);
+  }
+  // NB: scalar is stored in higher precision!
+  AUnaryFunctor(func_t f_, opmath_arg1_t a_): f(f_), a(a_) {}
+  private:
+    func_t f;
+    opmath_arg1_t a;
+};
+
+template<typename arg1_t, typename arg2_t, typename return_t, typename func_t>
+struct BUnaryFunctor {
+  using traits = function_traits<func_t>;
+  using opmath_arg2_t = typename traits::template arg<1>::type;
+  __device__ return_t operator()(arg1_t a) const {
+    return f(a, b);
+  }
+  // NB: scalar is stored in higher precision!
+  BUnaryFunctor(func_t f_, opmath_arg2_t b_): f(f_), b(b_) {}
+  private:
+    func_t f;
+    opmath_arg2_t b;
+};
+
+// Though seemingly noop, this inserts casts from arg1_t to func_t's type
+// (which may be higher precision), as well as casts to return_t
+template <typename arg1_t, typename arg2_t, typename return_t, typename func_t>
+struct BinaryFunctor {
+  __device__ return_t operator()(arg1_t a, arg2_t b) const {
+    return f(a, b);
+  }
+  BinaryFunctor(func_t f_): f(f_) {}
+  private:
+    func_t f;
+};
+
+// Unlike gpu_kernel_with_scalars, this allows you to pass a func_t which
+// accepts inputs at higher precision (typically opmath_t), but then
+// ensure that we load from memory at the correct precision (scalar_t)
+// to avoid expensive loads.  For the whole sordid story see
+// https://dev-discuss.pytorch.org/t/cuda-loops-case-study-code-generation-vs-templates/302
+template <typename arg1_t, typename arg2_t = arg1_t, typename return_t = arg1_t, typename func_t>
+void opmath_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {
+  TORCH_INTERNAL_ASSERT(iter.ntensors() == 3);
+
+  using traits = function_traits<func_t>;
+  using opmath_arg1_t = typename traits::template arg<0>::type;
+  using opmath_arg2_t = typename traits::template arg<1>::type;
+  static_assert(
+      traits::arity == 2,
+      "gpu_kernel_with_scalars only supports two input arguments");
+
+  if (iter.is_cpu_scalar(1)) {
+    AUnaryFunctor<arg1_t, arg2_t, return_t, func_t> af(f, iter.scalar_value<opmath_arg1_t>(1));
+    iter.remove_operand(1);
+    // TODO: When all kernels that use gpu_kernel_with_scalars are
+    // ported to structured, this device guard can be deleted.  This
+    // works around incorrect device guard generation for pre-structured
+    // kernels device guards, but structured kernels do it right and
+    // we can assume the device is already set correctly
+    const OptionalDeviceGuard device_guard(iter.device(1));
+    gpu_kernel(iter, af);
+  } else if (iter.is_cpu_scalar(2)) {
+    BUnaryFunctor<arg1_t, arg2_t, return_t, func_t> bf(f, iter.scalar_value<opmath_arg2_t>(2));
+    iter.remove_operand(2);
+    gpu_kernel(iter, bf);
+  } else {
+    gpu_kernel(iter, BinaryFunctor<arg1_t, arg2_t, return_t, func_t>(f));
+  }
+}
+
+template <typename scalar_t, typename return_t = scalar_t, typename func_t>
+void opmath_symmetric_gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {
+  // Use symmetric property of the functor to reduce number of kernels,
+  // requires f(a, b) == f(b, a)
+  TORCH_INTERNAL_ASSERT(iter.ntensors() == 3);
+
+  using traits = function_traits<func_t>;
+  using opmath_arg_t = typename traits::template arg<0>::type;
+  static_assert(
+      traits::arity == 2,
+      "gpu_kernel_with_scalars only supports two input arguments");
+  static_assert(std::is_same<opmath_arg_t, typename traits::template arg<1>::type>::value,
+                "f is not symmetric");
+
+  OptionalDeviceGuard device_guard;
+  opmath_arg_t scalar_val{};
+
+  if (iter.is_cpu_scalar(1)) {
+    scalar_val = iter.scalar_value<opmath_arg_t>(1);
+    iter.remove_operand(1);
+
+    // TODO: When all kernels that use gpu_kernel_with_scalars are
+    // ported to structured, this device guard can be deleted.  This
+    // works around incorrect device guard generation for pre-structured
+    // kernels device guards, but structured kernels do it right and
+    // we can assume the device is already set correctly
+    device_guard.reset_device(iter.device(1));
+  } else if (iter.is_cpu_scalar(2)) {
+    scalar_val = iter.scalar_value<opmath_arg_t>(2);
+    iter.remove_operand(2);
+  }
+
+  if (iter.ninputs() == 2) {
+    gpu_kernel(iter, BinaryFunctor<scalar_t, scalar_t, return_t, func_t>(f));
+  } else {
+    AUnaryFunctor<scalar_t, scalar_t, return_t, func_t> unary_f(f, scalar_val);
+    gpu_kernel(iter, unary_f);
+  }
+}
+
+// Legacy variant that assumes that func_t has the correct types
+// that we expect to load from memory
+template <typename func_t>
+void gpu_kernel_with_scalars(TensorIteratorBase& iter, const func_t& f) {
+  using traits = function_traits<func_t>;
+  static_assert(
+      traits::arity == 2,
+      "gpu_kernel_with_scalars only supports two input arguments");
+  using arg1_t = typename traits::template arg<0>::type;
+  using arg2_t = typename traits::template arg<1>::type;
+  using return_t = typename traits::result_type;
+  opmath_gpu_kernel_with_scalars<arg1_t, arg2_t, return_t, func_t>(iter, f);
+}
+
+namespace { // functions for `gpu_kernel_multiple_outputs`.
+
+// check the return type is `thrust::tuple`, not `std::tuple`.
+template <typename T> struct is_tuple: std::false_type {};
+
+template <typename ...T> struct is_tuple<thrust::tuple<T...>>: std::true_type {};
+
+template <int num_outputs, typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
+C10_LAUNCH_BOUNDS_1(num_threads())
+__global__ void unrolled_elementwise_kernel_for_multi_outputs(int N, func_t f, array_t data, inp_calc_t ic, out_calc_t oc) {
+  int remaining = N - block_work_size() * blockIdx.x;
+  elementwise_kernel_helper(f, memory::policies::multi_outputs_unroll<array_t, inp_calc_t, out_calc_t, num_outputs>(data, remaining, ic, oc));
+}
+
+template <int num_outputs, typename func_t, typename array_t, typename inp_calc_t, typename out_calc_t>
+static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const func_t& f, array_t data, inp_calc_t ic, out_calc_t oc) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
+  int64_t grid = (N + block_work_size() - 1) / block_work_size();
+  auto stream = at::cuda::getCurrentCUDAStream();
+  unrolled_elementwise_kernel_for_multi_outputs<num_outputs, func_t, array_t><<<grid, num_threads(), 0, stream>>>(N, f, data, ic, oc);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+template <typename func_t>
+void gpu_kernel_multiple_outputs_impl(TensorIteratorBase& iter, const func_t& f) {
+  using traits = function_traits<func_t>;
+  using output_t = typename traits::result_type;
+  static_assert(is_tuple<output_t>::value, "f's return type must be `thrust::tuple`");
+  constexpr int num_outputs = thrust::tuple_size<output_t>::value;
+  constexpr int num_inputs = traits::arity;
+  constexpr int ntensors = num_outputs + num_inputs;
+
+  TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
+  TORCH_INTERNAL_ASSERT(iter.ntensors() == ntensors);
+
+  at::detail::Array<char*, ntensors> data;
+  for (int i = 0; i < ntensors; i++) {
+    data[i] = (char*)iter.data_ptr(i);
+  }
+
+  int64_t numel = iter.numel();
+
+  if (iter.is_contiguous()) {
+    auto input_calc = TrivialOffsetCalculator<num_inputs>();
+    auto output_calc = TrivialOffsetCalculator<num_outputs>();
+    launch_unrolled_kernel_for_multi_outputs<num_outputs>(numel, f, data, input_calc, output_calc);
+  } else {
+    auto input_calc = make_input_offset_calculator<num_inputs>(iter);
+    auto output_calc = make_output_offset_calculator<num_outputs>(iter);
+    launch_unrolled_kernel_for_multi_outputs<num_outputs>(numel, f, data, input_calc, output_calc);
+  }
+}
+} // namespace
+
+template <typename func_t>
+void gpu_kernel_multiple_outputs(TensorIteratorBase& iter, const func_t& f) {
+  ASSERT_HOST_DEVICE_LAMBDA(func_t);
+
+  for (int arg = 0; arg < iter.ntensors(); arg++) {
+    TORCH_INTERNAL_ASSERT(iter.device(arg).is_cuda());
+  }
+
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      gpu_kernel_multiple_outputs(sub_iter, f);
+    }
+    return;
+  }
+
+  gpu_kernel_multiple_outputs_impl(iter, f);
+}
+
+} //namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Math.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Math.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..5188801f12c60b67d2da3486d02ced455a17231c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Math.cuh
@@ -0,0 +1,3375 @@
+#pragma once
+
+#include <ATen/AccumulateType.h>
+#include <ATen/jit_macros.h>
+#include <c10/macros/Macros.h>
+#include <ATen/native/cuda/jit_utils.h>
+
+namespace at {
+namespace native {
+// See note [Jiterator]
+// TODO: elaborate in this comment on the structure of math.cuh
+#if AT_USE_JITERATOR()
+
+const auto ndtri_string = jiterator_stringify(
+  /*
+  * This function is derived from the implementation of the digamma function in the Cephes Math Library.
+  * See note [3-Clause BSD License for the Cephes Math Library].
+  *
+  * Evaluates polynomial of degree N:
+  *
+  *                     2          N
+  * y  =  C  + C x + C x  +...+ C x
+  *        0    1     2          N
+  *
+  * Coefficients are stored in reverse order:
+  *
+  * coef[0] = C  , ..., coef[N] = C  .
+  *            N                   0
+  */
+  template <typename T>
+  T polevl(const T x, const T A[], const int len) {
+    // NOTE: This `polevl` is different from other `polevl`
+    // implementation (in PyTorch) which expect the `len` to be
+    // `len(A) - 1` instead of `len(A)`.
+    T result = 0;
+    for (int i = 0; i < len; ++i) {
+      result = result * x + A[i];
+    }
+    return result;
+  }
+
+  /*
+  * This function is derived from the implementation of the i1e function in the Cephes Math Library.
+  * See note [3-Clause BSD License for the Cephes Math Library].
+  *
+  * Computes the argument, x, for which the area under the Gaussian probability density function
+  * (integrated from minus infinity to x) is equal to y.
+  */
+  template <typename T>
+  T ndtri(T y0) {
+
+    constexpr T zero = 0;
+    constexpr T one = 1;
+
+    // Handles special cases
+    if (y0 == zero) {
+      return NEG_INFINITY;
+    }
+    if (y0 == one) {
+      return POS_INFINITY;
+    }
+    if (y0 < zero || y0 > one) {
+      return NAN;
+    }
+
+    bool code = true;
+    T y = y0;
+    // Note: the constant 0.135... is equal to exp(-2)
+    if (y > one - T{0.13533528323661269189}) {
+      y = one - y;
+      code = false;
+    }
+
+    if (y > T{0.13533528323661269189}) {
+      /* approximation for 0 <= |y - 0.5| <= 3/8 */
+      static const T P0[5] = {
+          -5.99633501014107895267E1,
+          9.80010754185999661536E1,
+          -5.66762857469070293439E1,
+          1.39312609387279679503E1,
+          -1.23916583867381258016E0,
+      };
+
+      static const T Q0[9] = {
+        1.00000000000000000000E0,
+        1.95448858338141759834E0,
+        4.67627912898881538453E0,
+        8.63602421390890590575E1,
+        -2.25462687854119370527E2,
+        2.00260212380060660359E2,
+        -8.20372256168333339912E1,
+        1.59056225126211695515E1,
+        -1.18331621121330003142E0,
+      };
+
+      /* sqrt(2pi) */
+      constexpr T s2pi = 2.50662827463100050242E0;
+
+      y = y - T{0.5};
+      const T y2 = y * y;
+      T x = y + y * (y2 * polevl(y2, P0, int{5}) / polevl(y2, Q0, int{9}));
+      return x * s2pi;
+    }
+
+    T x = sqrt(T{-2.} * log(y));
+    const T x0 = x - (log(x) / x);
+
+    const T z = one / x;
+    T x1;
+
+    /* y > exp(-32) = 1.2664165549e-14 */
+    if (x < T{8.0}) {
+      /* Approximation for interval z = sqrt(-2 log y ) between 2 and 8
+      * i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14.
+      */
+      static const T P1[9] = {
+        4.05544892305962419923E0,
+        3.15251094599893866154E1,
+        5.71628192246421288162E1,
+        4.40805073893200834700E1,
+        1.46849561928858024014E1,
+        2.18663306850790267539E0,
+        -1.40256079171354495875E-1,
+        -3.50424626827848203418E-2,
+        -8.57456785154685413611E-4,
+      };
+
+      static const T Q1[9] = {
+        1.00000000000000000000E0,
+        1.57799883256466749731E1,
+        4.53907635128879210584E1,
+        4.13172038254672030440E1,
+        1.50425385692907503408E1,
+        2.50464946208309415979E0,
+        -1.42182922854787788574E-1,
+        -3.80806407691578277194E-2,
+        -9.33259480895457427372E-4,
+      };
+
+      x1 = z * polevl(z, P1, int{9}) / polevl(z, Q1, int{9});
+    } else {
+      /* Approximation for interval z = sqrt(-2 log y ) between 8 and 64
+      * i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
+      */
+      static const T P2[9] = {
+        3.23774891776946035970E0,
+        6.91522889068984211695E0,
+        3.93881025292474443415E0,
+        1.33303460815807542389E0,
+        2.01485389549179081538E-1,
+        1.23716634817820021358E-2,
+        3.01581553508235416007E-4,
+        2.65806974686737550832E-6,
+        6.23974539184983293730E-9,
+      };
+
+      static const T Q2[9] = {
+        1.00000000000000000000E0,
+        6.02427039364742014255E0,
+        3.67983563856160859403E0,
+        1.37702099489081330271E0,
+        2.16236993594496635890E-1,
+        1.34204006088543189037E-2,
+        3.28014464682127739104E-4,
+        2.89247864745380683936E-6,
+        6.79019408009981274425E-9,
+      };
+
+      x1 = z * polevl(z, P2, int{9}) / polevl(z, Q2, int{9});
+    }
+
+    x = x0 - x1;
+    return (!code) ? x : -x;
+  }
+); // ndtri_string
+
+const auto log_ndtr_string = jiterator_stringify(
+  template <typename T>
+  T log_ndtr(T x) {
+    constexpr T SQRT1_2{0.707106781186547524400844362104849039};   // 1/sqrt(2)
+    T t = x * SQRT1_2;
+    if (x < T{-1.0}) {
+      return log(erfcx(-t) / 2) - t * t;
+    } else {
+      return log1p(-erfc(t) / 2);
+    }
+  }
+); // log_ndtr_string
+
+const auto gcd_string = jiterator_stringify(
+  template <typename T>
+  T gcd(const T a_in, const T b_in) {
+    T a = abs(a_in);
+    T b = abs(b_in);
+
+    while (a != T{0}) {
+      T c = a;
+      a = b % a;
+      b = c;
+    }
+
+    return b;
+  }
+); // gcd_string
+
+const auto lcm_string = jiterator_stringify(
+  template <typename T>
+  T gcd(const T a_in, const T b_in) {
+    T a = abs(a_in);
+    T b = abs(b_in);
+
+    while (a != T{0}) {
+      T c = a;
+      a = b % a;
+      b = c;
+    }
+
+    return b;
+  }
+
+  template <typename T>
+  T lcm(const T a, const T b) {
+    T g = gcd(a, b);
+    return (g == T{0}) ? T{0} : abs(a / g * b);
+  }
+); // lcm_string
+
+/*
+ * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h".
+ */
+// [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma
+const auto digamma_string = jiterator_stringify(
+  template <typename T>
+  T digamma(T x) {
+    static const double PI_f64 = 3.14159265358979323846;
+
+    // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard
+    if (x == 0) {
+      return copysign(POS_INFINITY, -x);
+    }
+
+    T result = 0;
+    if (x < 0) {
+      // Short-circuits if x is a negative integer and returns NaN
+      //   per the C++ standard
+      const bool x_is_integer = (x == trunc(x));
+      if (x_is_integer) {
+        return NAN;
+      }
+
+      // Extracts the fractional part of x as r, since tan(pi * r) is more numerically
+      // accurate than tan(pi * x). While these operations are mathematically equivalent
+      // since both x and r are in radians and tan() has a periodicity of pi, in practice
+      // the computation of pi * x is a source of error (when |x| > 1).
+      double q, r;
+      r = modf(static_cast<double>(x), &q);
+      result = - PI_f64 / tan(PI_f64 * r);
+      x = 1 - x;
+    }
+
+    while (x < T{10}) {
+      result -= T{1} / x;
+      x += T{1};
+    }
+
+    if (x == T{10}) {
+      return result + T{2.25175258906672110764};
+    }
+
+    T y = 0;
+    if (x < T{1.0e17}) {
+      const T A[] = {
+        8.33333333333333333333E-2,
+        -2.10927960927960927961E-2,
+        7.57575757575757575758E-3,
+        -4.16666666666666666667E-3,
+        3.96825396825396825397E-3,
+        -8.33333333333333333333E-3,
+        8.33333333333333333333E-2,
+      };
+
+
+      T z = T{1} / (x * x);
+
+      T polevl_result = 0;
+      for (int i = 0; i <= 6; i++) {
+        polevl_result = polevl_result * z + A[i];
+      }
+      y = z * polevl_result;
+    }
+
+    return log(x) - (T{0.5} / x) - y + result;
+  }
+); // digamma_string
+
+/*
+ * This function is derived from the implementation of the zeta function in the Cephes Math Library.
+ * See note [3-Clause BSD License for the Cephes Math Library].
+ */
+const auto zeta_string = jiterator_stringify(
+  template <typename T>
+  T zeta(T x, T q) {
+    const T MACHEP{1.11022302462515654042E-16};
+    constexpr T zero{0};
+    constexpr T half{0.5};
+    constexpr T one{1};
+    static const T A[] = {
+        12.0,
+        -720.0,
+        30240.0,
+        -1209600.0,
+        47900160.0,
+        -1.8924375803183791606e9, /*1.307674368e12/691*/
+        7.47242496e10,
+        -2.950130727918164224e12, /*1.067062284288e16/3617*/
+        1.1646782814350067249e14, /*5.109094217170944e18/43867*/
+        -4.5979787224074726105e15, /*8.028576626982912e20/174611*/
+        1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/
+        -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/
+    };
+
+    int i = 0;
+    T a, b, k, s, t, w;
+
+    // Short-circuits x -> +infty
+    if (x == one) {
+      return POS_INFINITY;
+    }
+
+    // Short-circuits x < 1 -> NaN
+    if (x < one) {
+      return NAN;
+    }
+
+    // Short-circuits negative q integers map to +infty,
+    //   negative q non-integers map to NaN
+    if (q <= zero) {
+      if (q == floor(q)) {
+        return POS_INFINITY;
+      }
+      if (x != floor(x)) {
+        return NAN;
+      }
+    }
+
+    s = pow(q, -x);
+    a = q;
+    i = 0;
+    b = zero;
+    while ((i < 9) || (a <= T{9.0})) {
+      i += 1;
+      a += one;
+      b = pow(a, -x);
+      s += b;
+      if ((-MACHEP * s < b) && (b < MACHEP * s)) {
+        return s;
+      }
+    };
+
+    w = a;
+    s += b * w / (x - one);
+    s -= half * b;
+    a = one;
+    k = zero;
+    for (int i = 0; i < 12; i++) {
+      a *= x + k;
+      b /= w;
+      t = a * b / A[i];
+      s = s + t;
+      t = fabs(t / s);
+
+      if (t < MACHEP) {
+        return s;
+      }
+
+      k += one;
+      a *= x + k;
+      b /= w;
+      k += one;
+    }
+
+    return s;
+  }
+); // zeta_string
+
+const auto trigamma_string = jiterator_stringify(
+  template <typename T>
+  T trigamma(T x) {
+    const T PI{3.14159265358979323846};
+    T sign = 1;
+    T result = 0;
+
+    if (x < T{0.5}) {
+      sign = -1;
+      T sin_pi_x = sin(PI * x);
+      result -= (PI * PI) / (sin_pi_x * sin_pi_x);
+      x = 1 - x;
+    }
+
+    for (int i = 0; i < 6; ++i) {
+      result += T{1} / (x * x);
+      x += 1;
+    }
+
+    const T one{1};
+    const T ixx = one / (x*x);
+    result += (one + one / (T{2}*x) + ixx * (one/T{6} - ixx * (one/T{30} - ixx * (one/T{42})))) / x;
+    return sign * result;
+}
+); // trigamma_string
+
+const auto lgamma_string = jiterator_stringify(
+  template <typename T>
+  T lgamma_kernel(T a) {
+    return lgamma(a);
+  }
+); // lgamma_string
+
+const auto polygamma_string = zeta_string + jiterator_stringify(
+  template <typename T>
+  T polygamma(T x, int n) {
+    // already blocked if n <= 1
+    const auto one = T{1};
+    return ((n % 2) ? one : -one) * exp(lgamma(static_cast<T>(n) + one)) *
+        zeta<T>(static_cast<T>(n + 1), x);
+  }
+); // polygamma_string
+
+const auto exp2_string = jiterator_stringify(
+  template <typename T>
+  T exp2_impl(T a) {
+    return exp2(a);
+  }
+
+  namespace std { template <typename _Ty> class complex; }
+  template <typename T>
+  std::complex<T> exp2_impl(std::complex<T> x) {
+    // There is no std::exp2 overload for complex, so instead
+    // use the identity 2^x = e^(ln(2) * x)
+    const auto ln_2 = static_cast<T>(0.693147180559945309417232121458176);
+    return exp(ln_2 * x);
+  }
+
+  template <typename T>
+  T exp2_kernel(T a) {
+    return exp2_impl(a);
+  }
+); // exp2_string
+
+const auto erfc_string = jiterator_stringify(
+  template <typename T>
+  T erfc_kernel(T a) {
+    return erfc(a);
+  }
+); // erfc_string
+
+const auto erfinv_string = jiterator_stringify(
+  template <typename T>
+  T erfinv_kernel(T a) {
+    return erfinv(a);
+  }
+); // erfinv_string
+
+const auto entr_string = jiterator_stringify(
+  template <typename T>
+  T entr(T a) {
+    if (a != a) {
+      return a;
+    }
+
+    if (a > 0) {
+      return -a * log(a);
+    }
+
+    if (a == 0) {
+      return 0;
+    }
+
+    return NEG_INFINITY;
+  }
+); // entr_string
+
+// NOTE: `kaiser_window_string` depends on `i0_string`
+//       for its implementation.
+const auto i0_string = jiterator_stringify(
+  template<typename T>
+  T chbevl(T x, const T array[], const int len) {
+
+      T b0, b1, b2;
+
+      b0 = array[0];
+      b1 = 0;
+
+      for (int i = 1; i < len; ++i)  {
+          b2 = b1;
+          b1 = b0;
+          b0 = x * b1 - b2 + array[i];
+      }
+
+      return T{0.5} * (b0 - b2);
+  }
+
+  template<typename T>
+  T i0(T _x) {
+      T x = fabs(_x);
+
+      if (x <= T{8.0}) {
+          /* Chebyshev coefficients for exp(-x) I0(x)
+          *   in the interval [0,8].
+          *
+          * lim(x->0){ exp(-x) I0(x) } = 1.
+          */
+          static const T A[] = {
+              -4.41534164647933937950E-18, 3.33079451882223809783E-17,
+              -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+              -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+              -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+              -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+              -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+              -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+              -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+              -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+              -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+              -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+              -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+              -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+              -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+              -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+
+          T y = (x / T{2.0}) - T{2.0};
+          return exp(x) * chbevl(y, A, int{30});
+      }
+
+      // Handles x > 8 case
+      /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
+      * in the inverted interval [8,infinity].
+      *
+      * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
+      */
+      const T B[] = {
+          -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+          4.46562142029675999901E-17,  3.46122286769746109310E-17,
+          -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+          1.77256013305652638360E-15,  3.81168066935262242075E-15,
+          -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+          1.54008621752140982691E-14,  3.85277838274214270114E-13,
+          7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+          -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+          1.18891471078464383424E-11,  4.94060238822496958910E-10,
+          3.39623202570838634515E-9,   2.26666899049817806459E-8,
+          2.04891858946906374183E-7,   2.89137052083475648297E-6,
+          6.88975834691682398426E-5,   3.36911647825569408990E-3,
+          8.04490411014108831608E-1};
+
+      return (exp(x) * chbevl(T{32.0} / x - T{2.0}, B, int{25})) / sqrt(x);
+  }
+); // i0_string
+
+const auto i1_string = jiterator_stringify(
+  template<typename T>
+  T chbevl(const T x, const T array[], const int len) {
+      T b0, b1, b2;
+
+      b0 = array[0];
+      b1 = 0;
+
+      for (int i = 1; i < len; ++i)  {
+          b2 = b1;
+          b1 = b0;
+          b0 = x * b1 - b2 + array[i];
+      }
+
+      return T{0.5} * (b0 - b2);
+  }
+
+  template <typename T>
+  T i1(T _x) {
+    const T x = fabs(_x);
+
+    if (x <= T{8.0}) {
+      // Chebyshev coefficients for exp(-x) i1(x) in the internal [0, 8]
+      //   lim(x->0){ exp(-x) i1(x) / x } = 1/2
+      static const T coefficients[] = {
+          2.77791411276104639959E-18, -2.11142121435816608115E-17,
+          1.55363195773620046921E-16, -1.10559694773538630805E-15,
+          7.60068429473540693410E-15, -5.04218550472791168711E-14,
+          3.22379336594557470981E-13, -1.98397439776494371520E-12,
+          1.17361862988909016308E-11, -6.66348972350202774223E-11,
+          3.62559028155211703701E-10, -1.88724975172282928790E-9,
+          9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+          2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+          3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+          4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+          5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+          4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+          2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+          1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+          2.52587186443633654823E-1};
+      const T y = x / T{2.0} - T{2.0};
+      const T out = exp(x) * x * chbevl(y, coefficients, int{29});
+      return (_x < T{0.0}) ? -out : out;
+    }
+
+    // Chebyshev coefficients for exp(-x) sqrt(x) i1(x)
+    //   in the inverted interval [8, infinity]
+    //   lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi)
+    static const T coefficients[] = {
+      7.51729631084210481353E-18,  4.41434832307170791151E-18,
+      -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+      2.96262899764595013876E-16,  3.30820231092092828324E-16,
+      -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+      1.04202769841288027642E-14,  4.27244001671195135429E-14,
+      -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+      -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+      1.41258074366137813316E-11,  3.25260358301548823856E-11,
+      -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+      -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+      -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+      -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+      7.78576235018280120474E-1};
+    const T out = (exp(x) * chbevl(T{32.} / x - T{2.}, coefficients, int{25})) / sqrt(x);
+    return (_x < T{0.}) ? -out : out;
+  }
+); // i1_string
+
+const auto i1e_string = jiterator_stringify(
+  template<typename T>
+  T chbevl(const T x, const T array[], const int len) {
+      T b0, b1, b2;
+
+      b0 = array[0];
+      b1 = 0;
+
+      for (int i = 1; i < len; ++i)  {
+          b2 = b1;
+          b1 = b0;
+          b0 = x * b1 - b2 + array[i];
+      }
+
+      return T{0.5} * (b0 - b2);
+  }
+
+  // See double and float instantiations below
+  template <typename T>
+  T i1e(T _x) { }
+
+  // Double specialization (uses different coefficients than the float version)
+  template<>
+  double i1e(double _x) {
+    const double x = fabs(_x);
+    if (x <= double{8.}) {
+      // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8].
+      // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2.
+      static const double coefficients[] = {
+        2.77791411276104639959E-18, -2.11142121435816608115E-17,
+        1.55363195773620046921E-16, -1.10559694773538630805E-15,
+        7.60068429473540693410E-15, -5.04218550472791168711E-14,
+        3.22379336594557470981E-13, -1.98397439776494371520E-12,
+        1.17361862988909016308E-11, -6.66348972350202774223E-11,
+        3.62559028155211703701E-10, -1.88724975172282928790E-9,
+        9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+        2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+        3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+        4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+        5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+        4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+        2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+        1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+        2.52587186443633654823E-1};
+      const double y = x / double{2.} - double{2.};
+      const double out = chbevl(y, coefficients, int{29}) * x;
+      return (_x < 0.) ? -out : out;
+    }
+
+    // Chebyshev coefficients for exp(-x) sqrt(x) i1(x)
+    //   in the inverted interval (8, infinity].
+    // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi).
+    // TODO: what's an "inverted interval"? Open on the left
+    //   and closed on the right?
+  static const double coefficients[] = {
+      7.51729631084210481353E-18,  4.41434832307170791151E-18,
+      -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+      2.96262899764595013876E-16,  3.30820231092092828324E-16,
+      -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+      1.04202769841288027642E-14,  4.27244001671195135429E-14,
+      -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+      -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+      1.41258074366137813316E-11,  3.25260358301548823856E-11,
+      -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+      -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+      -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+      -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+      7.78576235018280120474E-1};
+
+    const double out = chbevl(double{32.} / x - double{2.}, coefficients, int{25}) / sqrt(x);
+    return (_x < double{0.}) ? -out : out;
+  }
+
+  // Float specialization (uses different coefficients than the double version)
+  template<>
+  float i1e(float _x) {
+    const float x = fabsf(_x);
+    if (x <= float{8.}) {
+      // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8].
+      // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2.
+      static const float coefficients[] = {
+        9.38153738649577178388E-9f,
+        -4.44505912879632808065E-8f,
+        2.00329475355213526229E-7f,
+        -8.56872026469545474066E-7f,
+        3.47025130813767847674E-6f,
+        -1.32731636560394358279E-5f,
+        4.78156510755005422638E-5f,
+        -1.61760815825896745588E-4f,
+        5.12285956168575772895E-4f,
+        -1.51357245063125314899E-3f,
+        4.15642294431288815669E-3f,
+        -1.05640848946261981558E-2f,
+        2.47264490306265168283E-2f,
+        -5.29459812080949914269E-2f,
+        1.02643658689847095384E-1f,
+        -1.76416518357834055153E-1f,
+        2.52587186443633654823E-1f};
+      const float y = x / float{2.} - float{2.};
+      const float out = chbevl(y, coefficients, int{17}) * x;
+      return (_x < 0.) ? -out : out;
+    }
+
+    // Chebyshev coefficients for exp(-x) sqrt(x) i1(x)
+    //   in the inverted interval (8, infinity].
+    // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi).
+    // TODO: what's an "inverted interval"? Open on the left
+    //   and closed on the right?
+  static const float coefficients[] = {
+      -3.83538038596423702205E-9f,
+      -2.63146884688951950684E-8f,
+      -2.51223623787020892529E-7f,
+      -3.88256480887769039346E-6f,
+      -1.10588938762623716291E-4f,
+      -9.76109749136146840777E-3f,
+      7.78576235018280120474E-1f};
+
+    const float out = chbevl(float{32.} / x - float{2.}, coefficients, int{7}) / sqrt(x);
+    return (_x < float{0.}) ? -out : out;
+  }
+); // i1e_string
+
+const auto kaiser_window_string = i0_string + jiterator_stringify(
+  template <typename T>
+  T kaiser_window(T a, T inv_alpha, T beta, T inv_i0_beta) {
+    T x = a * inv_alpha - T{1};
+    T y = max(T{0}, T{1} - x * x);
+    return i0(beta * sqrt(y)) * inv_i0_beta;
+  }
+); // kaiser_window_string
+
+const auto sinc_string = jiterator_stringify(
+  template <typename T>
+  T sinc(T a) {
+    if (a == T(0)) {
+      return T(1);
+    } else {
+      constexpr T pi = T(3.14159265358979323846L);
+      T product = pi * a;
+      return std::sin(product) / product;
+    }
+  }
+); // sinc_string
+
+const auto erfcx_string = jiterator_stringify(
+  /* The next function is taken from http://ab-initio.mit.edu/Faddeev */
+
+  /* Copyright (c) 2012 Massachusetts Institute of Technology
+  *
+  * Permission is hereby granted, free of charge, to any person obtaining
+  * a copy of this software and associated documentation files (the
+  * "Software"), to deal in the Software without restriction, including
+  * without limitation the rights to use, copy, modify, merge, publish,
+  * distribute, sublicense, and/or sell copies of the Software, and to
+  * permit persons to whom the Software is furnished to do so, subject to
+  * the following conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+  /* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by
+    Steven G. Johnson, October 2012.
+
+    This function combines a few different ideas.
+
+    First, for x > 50, it uses a continued-fraction expansion (same as
+    for the Faddeeva function, but with algebraic simplifications for z=i*x).
+
+    Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations,
+    but with two twists:
+
+        a) It maps x to y = 4 / (4+x) in [0,1].  This simple transformation,
+          inspired by a similar transformation in the octave-forge/specfun
+          erfcx by Soren Hauberg, results in much faster Chebyshev convergence
+          than other simple transformations I have examined.
+
+        b) Instead of using a single Chebyshev polynomial for the entire
+          [0,1] y interval, we break the interval up into 100 equal
+          subintervals, with a switch/lookup table, and use much lower
+          degree Chebyshev polynomials in each subinterval. This greatly
+          improves performance in my tests.
+
+    For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfc(x),
+    with the usual checks for overflow etcetera.
+
+    Performance-wise, it seems to be substantially faster than either
+    the SLATEC DERFC function [or an erfcx function derived therefrom]
+    or Cody's CALERF function (from netlib.org/specfun), while
+    retaining near machine precision in accuracy.
+  */
+
+  /* Given y100 = 100 * y, where y = 4 / (4 + x) for x >= 0, compute erfc(x).
+
+    Uses a look-up table of 100 different Chebyshev polynomials
+    for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated
+    with the help of Maple and a little shell script.   This allows
+    the Chebyshev polynomials to be of significantly lower degree (about 1/4)
+    compared to fitting the whole [0,1] interval with a single polynomial.
+  */
+
+  // TODO: review if this is computing in double when given a float input
+  template <typename T>
+  T erfcx_y100(T y100) {
+    switch (static_cast<int>(y100)) {
+      case 0: {
+      T t = 2*y100 - 1;
+      return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 1: {
+      T t = 2*y100 - 3;
+      return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 2: {
+      T t = 2*y100 - 5;
+      return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 3: {
+      T t = 2*y100 - 7;
+      return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 4: {
+      T t = 2*y100 - 9;
+      return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 5: {
+      T t = 2*y100 - 11;
+      return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 6: {
+      T t = 2*y100 - 13;
+      return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 7: {
+      T t = 2*y100 - 15;
+      return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 8: {
+      T t = 2*y100 - 17;
+      return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 9: {
+      T t = 2*y100 - 19;
+      return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 10: {
+      T t = 2*y100 - 21;
+      return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 11: {
+      T t = 2*y100 - 23;
+      return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 12: {
+      T t = 2*y100 - 25;
+      return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 13: {
+      T t = 2*y100 - 27;
+      return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 14: {
+      T t = 2*y100 - 29;
+      return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 15: {
+      T t = 2*y100 - 31;
+      return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 16: {
+      T t = 2*y100 - 33;
+      return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 17: {
+      T t = 2*y100 - 35;
+      return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 18: {
+      T t = 2*y100 - 37;
+      return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 19: {
+      T t = 2*y100 - 39;
+      return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 20: {
+      T t = 2*y100 - 41;
+      return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 21: {
+      T t = 2*y100 - 43;
+      return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 22: {
+      T t = 2*y100 - 45;
+      return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 23: {
+      T t = 2*y100 - 47;
+      return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 24: {
+      T t = 2*y100 - 49;
+      return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 25: {
+      T t = 2*y100 - 51;
+      return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 26: {
+      T t = 2*y100 - 53;
+      return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 27: {
+      T t = 2*y100 - 55;
+      return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 28: {
+      T t = 2*y100 - 57;
+      return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 29: {
+      T t = 2*y100 - 59;
+      return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 30: {
+      T t = 2*y100 - 61;
+      return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t;
+      }
+      case 31: {
+      T t = 2*y100 - 63;
+      return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 32: {
+      T t = 2*y100 - 65;
+      return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 33: {
+      T t = 2*y100 - 67;
+      return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 34: {
+      T t = 2*y100 - 69;
+      return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 35: {
+      T t = 2*y100 - 71;
+      return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 36: {
+      T t = 2*y100 - 73;
+      return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 37: {
+      T t = 2*y100 - 75;
+      return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 38: {
+      T t = 2*y100 - 77;
+      return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 39: {
+      T t = 2*y100 - 79;
+      return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 40: {
+      T t = 2*y100 - 81;
+      return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 41: {
+      T t = 2*y100 - 83;
+      return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 42: {
+      T t = 2*y100 - 85;
+      return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 43: {
+      T t = 2*y100 - 87;
+      return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 44: {
+      T t = 2*y100 - 89;
+      return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 45: {
+      T t = 2*y100 - 91;
+      return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 46: {
+      T t = 2*y100 - 93;
+      return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 47: {
+      T t = 2*y100 - 95;
+      return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 48: {
+      T t = 2*y100 - 97;
+      return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 49: {
+      T t = 2*y100 - 99;
+      return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 50: {
+      T t = 2*y100 - 101;
+      return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 51: {
+      T t = 2*y100 - 103;
+      return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 52: {
+      T t = 2*y100 - 105;
+      return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 53: {
+      T t = 2*y100 - 107;
+      return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 54: {
+      T t = 2*y100 - 109;
+      return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 55: {
+      T t = 2*y100 - 111;
+      return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 56: {
+      T t = 2*y100 - 113;
+      return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 57: {
+      T t = 2*y100 - 115;
+      return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 58: {
+      T t = 2*y100 - 117;
+      return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 59: {
+      T t = 2*y100 - 119;
+      return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 60: {
+      T t = 2*y100 - 121;
+      return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 61: {
+      T t = 2*y100 - 123;
+      return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 62: {
+      T t = 2*y100 - 125;
+      return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 63: {
+      T t = 2*y100 - 127;
+      return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 64: {
+      T t = 2*y100 - 129;
+      return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 65: {
+      T t = 2*y100 - 131;
+      return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 66: {
+      T t = 2*y100 - 133;
+      return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 67: {
+      T t = 2*y100 - 135;
+      return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 68: {
+      T t = 2*y100 - 137;
+      return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 69: {
+      T t = 2*y100 - 139;
+      return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 70: {
+      T t = 2*y100 - 141;
+      return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 71: {
+      T t = 2*y100 - 143;
+      return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 72: {
+      T t = 2*y100 - 145;
+      return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t;
+      }
+      case 73: {
+      T t = 2*y100 - 147;
+      return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 74: {
+      T t = 2*y100 - 149;
+      return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 75: {
+      T t = 2*y100 - 151;
+      return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 76: {
+      T t = 2*y100 - 153;
+      return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 77: {
+      T t = 2*y100 - 155;
+      return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 78: {
+      T t = 2*y100 - 157;
+      return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 79: {
+      T t = 2*y100 - 159;
+      return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 80: {
+      T t = 2*y100 - 161;
+      return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 81: {
+      T t = 2*y100 - 163;
+      return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 82: {
+      T t = 2*y100 - 165;
+      return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 83: {
+      T t = 2*y100 - 167;
+      return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 84: {
+      T t = 2*y100 - 169;
+      return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 85: {
+      T t = 2*y100 - 171;
+      return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 86: {
+      T t = 2*y100 - 173;
+      return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 87: {
+      T t = 2*y100 - 175;
+      return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 88: {
+      T t = 2*y100 - 177;
+      return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 89: {
+      T t = 2*y100 - 179;
+      return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 90: {
+      T t = 2*y100 - 181;
+      return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 91: {
+      T t = 2*y100 - 183;
+      return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 92: {
+      T t = 2*y100 - 185;
+      return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 93: {
+      T t = 2*y100 - 187;
+      return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 94: {
+      T t = 2*y100 - 189;
+      return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 95: {
+      T t = 2*y100 - 191;
+      return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 96: {
+      T t = 2*y100 - 193;
+      return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 97: {
+      T t = 2*y100 - 195;
+      return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 98: {
+      T t = 2*y100 - 197;
+      return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t;
+      }
+      case 99: {
+      T t = 2*y100 - 199;
+      return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t;
+      }
+    }
+
+    // we only get here if y = 1, i.e. |x| < 4*eps, in which case
+    // erfcx is within 1e-15 of 1..
+    return 1.;
+  }
+
+  template <typename T>
+  T erfcx(T x) {
+    // Short-circuits on NaN (returning NaN)
+    if (x != x) {
+      return x;
+    }
+
+    if (x >= 0) {
+      if (x > T{50}) { // continued-fraction expansion is faster
+        const T ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi)
+
+        if (x > T{5e7}) { // 1-term expansion, important to avoid overflow
+          return ispi / x;
+        }
+
+        /* 5-term expansion (rely on compiler for CSE), simplified from:
+                  ispi / (x+0.5/(x+1/(x+1.5/(x+2/x))))  */
+        return ispi * ((x*x) * (x*x+T{4.5}) + T{2}) / (x * ((x*x) * (x*x+T{5}) + T{3.75}));
+      }
+
+      // x >= 0 x <= 50
+      return erfcx_y100(T{400} / (T{4} + x));
+    }
+
+    // x < 0
+    if (x < T{-26.7}) {
+      return POS_INFINITY;
+    } else if (x < T{-6.1}) {
+      return T{2} * exp(x * x);
+    }
+
+    // x < 0 and x >= -6.1
+    return T{2} * exp(x * x) - erfcx_y100(T{400} / (T{4} - x));
+  }
+); // erfcx_string
+
+const auto airy_ai_string = jiterator_stringify(
+    template<typename T>
+    T airy_ai_forward(T x) {
+        static const T AN[] = {
+                +3.46538101525629032477e-01,
+                +1.20075952739645805542e+01,
+                +7.62796053615234516538e+01,
+                +1.68089224934630576269e+02,
+                +1.59756391350164413639e+02,
+                +7.05360906840444183113e+01,
+                +1.40264691163389668864e+01,
+                +9.99999999999999995305e-01,
+        };
+
+        static const T AD[] = {
+                +5.67594532638770212846e-01,
+                +1.47562562584847203173e+01,
+                +8.45138970141474626562e+01,
+                +1.77318088145400459522e+02,
+                +1.64234692871529701831e+02,
+                +7.14778400825575695274e+01,
+                +1.40959135607834029598e+01,
+                +1.00000000000000000470e+00,
+        };
+
+        static const T AFN[] = {
+                -1.31696323418331795333e-01,
+                -6.26456544431912369773e-01,
+                -6.93158036036933542233e-01,
+                -2.79779981545119124951e-01,
+                -4.91900132609500318020e-02,
+                -4.06265923594885404393e-03,
+                -1.59276496239262096340e-04,
+                -2.77649108155232920844e-06,
+                -1.67787698489114633780e-08,
+        };
+
+        static const T AFD[] = {
+                +1.33560420706553243746e+01,
+                +3.26825032795224613948e+01,
+                +2.67367040941499554804e+01,
+                +9.18707402907259625840e+00,
+                +1.47529146771666414581e+00,
+                +1.15687173795188044134e-01,
+                +4.40291641615211203805e-03,
+                +7.54720348287414296618e-05,
+                +4.51850092970580378464e-07,
+        };
+
+        static const T AGN[] = {
+                +1.97339932091685679179e-02,
+                +3.91103029615688277255e-01,
+                +1.06579897599595591108e+00,
+                +9.39169229816650230044e-01,
+                +3.51465656105547619242e-01,
+                +6.33888919628925490927e-02,
+                +5.85804113048388458567e-03,
+                +2.82851600836737019778e-04,
+                +6.98793669997260967291e-06,
+                +8.11789239554389293311e-08,
+                +3.41551784765923618484e-10,
+        };
+
+        static const T AGD[] = {
+                +9.30892908077441974853e+00,
+                +1.98352928718312140417e+01,
+                +1.55646628932864612953e+01,
+                +5.47686069422975497931e+00,
+                +9.54293611618961883998e-01,
+                +8.64580826352392193095e-02,
+                +4.12656523824222607191e-03,
+                +1.01259085116509135510e-04,
+                +1.17166733214413521882e-06,
+                +4.91834570062930015649e-09,
+        };
+
+        int domain_flag = 0;
+
+        T ai;
+
+        if (isinf(x)) {
+            return NAN;
+        }
+
+        if (x > T(103.892)) {
+            return T(0.0);
+        }
+
+        T f;
+        T g;
+        T k;
+
+        if (x < T(-2.09)) {
+            T z = T(1.0) / (T(-2.0) * x * sqrt(-x) / T(3.0));
+
+            T afn = 0.0;
+
+            for (uint8_t index = 0; index <= 8; index++) {
+                afn = afn * (z * z) + AFN[index];
+            }
+
+            T afd = 0.0;
+
+            for (uint8_t index = 0; index <= 8; index++) {
+                afd = afd * (z * z) + AFD[index];
+            }
+
+            T agn = 0.0;
+
+            for (uint8_t index = 0; index <= 10 + 0; index++) {
+                agn = agn * (z * z) + AGN[index];
+            }
+
+            T agd = 0.0;
+
+            for (uint8_t index = 0; index <= 10 - 1; index++) {
+                agd = agd * (z * z) + AGD[index];
+            }
+
+            T t = T(-2.0) * x * sqrt(-x) / T(3.0) + T(0.25) * T(3.14159265358979323846);
+
+            return T(5.64189583547756286948e-01) / sqrt(sqrt(-x)) * (sin(t) * (T(1.0) + z * z * afn / afd) - cos(t) * (z * agn / agd));
+        }
+
+        if (x >= T(2.09)) {
+            domain_flag = 5;
+
+            T zeta = T(2.0) * x * sqrt(x) / T(3.0);
+
+            T an = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                an = an * (T(1.0) / zeta) + AN[index];
+            }
+
+            T ad = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                ad = ad * (T(1.0) / zeta) + AD[index];
+            }
+
+            ai = T(5.64189583547756286948e-01) * (an / ad) / (T(2.0) * sqrt(sqrt(x)) * exp(zeta));
+
+            if (x > T(8.3203353)) {
+                return ai;
+            }
+        }
+
+        f = 1.0;
+        g = x;
+        k = 1.0;
+
+        T m = 1.0;
+        T n = x;
+        T t = 1.0;
+        T z = x * x * x;
+
+        while (t > T(1.11022302462515654042e-16)) {
+            m *= z;
+            k += T(1.0);
+            m /= k;
+            n *= z;
+            k += T(1.0);
+            n /= k;
+            m /= k;
+            f += m;
+            k += T(1.0);
+            n /= k;
+            g += n;
+
+            t = abs(m / f);
+        }
+
+        if ((domain_flag & 1) == 0) {
+            return T(0.355028053887817239260) * f - T(0.258819403792806798405) * g;
+        }
+
+        return ai;
+    } // T airy_ai(T x)
+); // airy_ai_string
+
+const auto bessel_j0_string = jiterator_stringify(
+    template<typename T>
+    T bessel_j0_forward(T x) {
+        static const T PP[] = {
+                +7.96936729297347051624e-04,
+                +8.28352392107440799803e-02,
+                +1.23953371646414299388e+00,
+                +5.44725003058768775090e+00,
+                +8.74716500199817011941e+00,
+                +5.30324038235394892183e+00,
+                +9.99999999999999997821e-01,
+        };
+
+        static const T PQ[] = {
+                +9.24408810558863637013e-04,
+                +8.56288474354474431428e-02,
+                +1.25352743901058953537e+00,
+                +5.47097740330417105182e+00,
+                +8.76190883237069594232e+00,
+                +5.30605288235394617618e+00,
+                +1.00000000000000000218e+00,
+        };
+
+        static const T QP[] = {
+                -1.13663838898469149931e-02,
+                -1.28252718670509318512e+00,
+                -1.95539544257735972385e+01,
+                -9.32060152123768231369e+01,
+                -1.77681167980488050595e+02,
+                -1.47077505154951170175e+02,
+                -5.14105326766599330220e+01,
+                -6.05014350600728481186e+00,
+        };
+
+        static const T QQ[] = {
+                +6.43178256118178023184e+01,
+                +8.56430025976980587198e+02,
+                +3.88240183605401609683e+03,
+                +7.24046774195652478189e+03,
+                +5.93072701187316984827e+03,
+                +2.06209331660327847417e+03,
+                +2.42005740240291393179e+02,
+        };
+
+        static const T RP[] = {
+                -4.79443220978201773821e+09,
+                +1.95617491946556577543e+12,
+                -2.49248344360967716204e+14,
+                +9.70862251047306323952e+15,
+        };
+
+        static const T RQ[] = {
+                +4.99563147152651017219e+02,
+                +1.73785401676374683123e+05,
+                +4.84409658339962045305e+07,
+                +1.11855537045356834862e+10,
+                +2.11277520115489217587e+12,
+                +3.10518229857422583814e+14,
+                +3.18121955943204943306e+16,
+                +1.71086294081043136091e+18,
+        };
+
+        if (x < T(0)) {
+            x = -x;
+        }
+
+        if (x <= T(5.0)) {
+            if (x < T(0.00001)) {
+                return T(1.0) - x * x / T(4.0);
+            }
+
+            T rp = 0.0;
+
+            for (uint8_t index = 0; index <= 3; index++) {
+                rp = rp * (x * x) + RP[index];
+            }
+
+            T rq = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                rq = rq * (x * x) + RQ[index];
+            }
+
+            return (x * x - T(5.78318596294678452118e+00)) * (x * x - T(3.04712623436620863991e+01)) * rp / rq;
+        }
+
+        T pp = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pp = pp * (T(25.0) / (x * x)) + PP[index];
+        }
+
+        T pq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pq = pq * (T(25.0) / (x * x)) + PQ[index];
+        }
+
+        T qp = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            qp = qp * (T(25.0) / (x * x)) + QP[index];
+        }
+
+        T qq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            qq = qq * (T(25.0) / (x * x)) + QQ[index];
+        }
+
+        return (pp / pq * cos(x - T(0.785398163397448309615660845819875721)) - T(5.0) / x * (qp / qq) * sin(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x);
+    } // bessel_j0_forward(T x)
+); // bessel_j0_string
+
+const auto bessel_y0_string = bessel_j0_string + jiterator_stringify(
+    template<typename T>
+    T bessel_y0_forward(T x) {
+        static const T PP[] = {
+                +7.96936729297347051624e-04,
+                +8.28352392107440799803e-02,
+                +1.23953371646414299388e+00,
+                +5.44725003058768775090e+00,
+                +8.74716500199817011941e+00,
+                +5.30324038235394892183e+00,
+                +9.99999999999999997821e-01,
+        };
+
+        static const T PQ[] = {
+                +9.24408810558863637013e-04,
+                +8.56288474354474431428e-02,
+                +1.25352743901058953537e+00,
+                +5.47097740330417105182e+00,
+                +8.76190883237069594232e+00,
+                +5.30605288235394617618e+00,
+                +1.00000000000000000218e+00,
+        };
+
+        static const T QP[] = {
+                -1.13663838898469149931e-02,
+                -1.28252718670509318512e+00,
+                -1.95539544257735972385e+01,
+                -9.32060152123768231369e+01,
+                -1.77681167980488050595e+02,
+                -1.47077505154951170175e+02,
+                -5.14105326766599330220e+01,
+                -6.05014350600728481186e+00,
+        };
+
+        static const T QQ[] = {
+                +6.43178256118178023184e+01,
+                +8.56430025976980587198e+02,
+                +3.88240183605401609683e+03,
+                +7.24046774195652478189e+03,
+                +5.93072701187316984827e+03,
+                +2.06209331660327847417e+03,
+                +2.42005740240291393179e+02,
+        };
+
+        static const T YP[] = {
+                +1.55924367855235737965e+04,
+                -1.46639295903971606143e+07,
+                +5.43526477051876500413e+09,
+                -9.82136065717911466409e+11,
+                +8.75906394395366999549e+13,
+                -3.46628303384729719441e+15,
+                +4.42733268572569800351e+16,
+                -1.84950800436986690637e+16,
+        };
+
+        static const T YQ[] = {
+                +1.04128353664259848412e+03,
+                +6.26107330137134956842e+05,
+                +2.68919633393814121987e+08,
+                +8.64002487103935000337e+10,
+                +2.02979612750105546709e+13,
+                +3.17157752842975028269e+15,
+                +2.50596256172653059228e+17,
+        };
+
+        if (x <= T(5.0)) {
+            if (x == T(0.0)) {
+                return NEG_INFINITY;
+            }
+
+            if (x < T(0.0)) {
+                NAN;
+            }
+
+            T yp = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                yp = yp * (x * x) + YP[index];
+            }
+
+            T yq = 0.0;
+
+            for (uint8_t index = 0; index <= 6; index++) {
+                yq = yq * (x * x) + YQ[index];
+            }
+
+            return yp / yq + (T(0.636619772367581343075535053490057448) * log(x) * bessel_j0_forward(x));
+        }
+
+        T pp = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pp = pp * (T(25.0) / (x * x)) + PP[index];
+        }
+
+        T pq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pq = pq * (T(25.0) / (x * x)) + PQ[index];
+        }
+
+        T qp = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            qp = qp * (T(25.0) / (x * x)) + QP[index];
+        }
+
+        T qq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            qq = qq * (T(25.0) / (x * x)) + QQ[index];
+        }
+
+        return (pp / pq * sin(x - T(0.785398163397448309615660845819875721)) + T(5.0) / x * (qp / qq) * cos(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x);
+    } // bessel_y0_forward(T x)
+); // bessel_y0_string
+
+const auto bessel_j1_string = jiterator_stringify(
+    template<typename T>
+    T bessel_j1_forward(T x) {
+        static const T PP[] = {
+                +7.62125616208173112003e-04,
+                +7.31397056940917570436e-02,
+                +1.12719608129684925192e+00,
+                +5.11207951146807644818e+00,
+                +8.42404590141772420927e+00,
+                +5.21451598682361504063e+00,
+                +1.00000000000000000254e+00,
+        };
+
+        static const T PQ[] = {
+                +5.71323128072548699714e-04,
+                +6.88455908754495404082e-02,
+                +1.10514232634061696926e+00,
+                +5.07386386128601488557e+00,
+                +8.39985554327604159757e+00,
+                +5.20982848682361821619e+00,
+                +9.99999999999999997461e-01,
+        };
+
+        static const T QP[] = {
+                +5.10862594750176621635e-02,
+                +4.98213872951233449420e+00,
+                +7.58238284132545283818e+01,
+                +3.66779609360150777800e+02,
+                +7.10856304998926107277e+02,
+                +5.97489612400613639965e+02,
+                +2.11688757100572135698e+02,
+                +2.52070205858023719784e+01,
+        };
+
+        static const T QQ[] = {
+                +7.42373277035675149943e+01,
+                +1.05644886038262816351e+03,
+                +4.98641058337653607651e+03,
+                +9.56231892404756170795e+03,
+                +7.99704160447350683650e+03,
+                +2.82619278517639096600e+03,
+                +3.36093607810698293419e+02,
+        };
+
+        static const T RP[] = {
+                -8.99971225705559398224e+08,
+                +4.52228297998194034323e+11,
+                -7.27494245221818276015e+13,
+                +3.68295732863852883286e+15,
+        };
+
+        static const T RQ[] = {
+                +6.20836478118054335476e+02,
+                +2.56987256757748830383e+05,
+                +8.35146791431949253037e+07,
+                +2.21511595479792499675e+10,
+                +4.74914122079991414898e+12,
+                +7.84369607876235854894e+14,
+                +8.95222336184627338078e+16,
+                +5.32278620332680085395e+18,
+        };
+
+        if (x < T(0.0)) {
+            return -bessel_j1_forward(-x);
+        }
+
+        if (x <= T(5.0)) {
+            T rp = 0.0;
+
+            for (uint8_t index = 0; index <= 3; index++) {
+                rp = rp * (x * x) + RP[index];
+            }
+
+            T rq = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                rq = rq * (x * x) + RQ[index];
+            }
+
+            return rp / rq * x * (x * x - T(1.46819706421238932572e+01)) * (x * x - T(4.92184563216946036703e+01));
+        }
+
+        T pp = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index];
+        }
+
+        T pq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index];
+        }
+
+        T qp = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index];
+        }
+
+        T qq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index];
+        }
+
+        return (pp / pq * cos(x - T(2.356194490192344928846982537459627163)) - T(5.0) / x * (qp / qq) * sin(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x);
+    } // bessel_j1_forward(T x)
+); // bessel_j1_string
+
+const auto bessel_y1_string = bessel_j1_string + jiterator_stringify(
+    template<typename T>
+    T bessel_y1_forward(T x) {
+        static const T PP[] = {
+                +7.62125616208173112003e-04,
+                +7.31397056940917570436e-02,
+                +1.12719608129684925192e+00,
+                +5.11207951146807644818e+00,
+                +8.42404590141772420927e+00,
+                +5.21451598682361504063e+00,
+                +1.00000000000000000254e+00,
+        };
+
+        static const T PQ[] = {
+                +5.71323128072548699714e-04,
+                +6.88455908754495404082e-02,
+                +1.10514232634061696926e+00,
+                +5.07386386128601488557e+00,
+                +8.39985554327604159757e+00,
+                +5.20982848682361821619e+00,
+                +9.99999999999999997461e-01,
+        };
+
+        static const T QP[] = {
+                +5.10862594750176621635e-02,
+                +4.98213872951233449420e+00,
+                +7.58238284132545283818e+01,
+                +3.66779609360150777800e+02,
+                +7.10856304998926107277e+02,
+                +5.97489612400613639965e+02,
+                +2.11688757100572135698e+02,
+                +2.52070205858023719784e+01,
+        };
+
+        static const T QQ[] = {
+                +7.42373277035675149943e+01,
+                +1.05644886038262816351e+03,
+                +4.98641058337653607651e+03,
+                +9.56231892404756170795e+03,
+                +7.99704160447350683650e+03,
+                +2.82619278517639096600e+03,
+                +3.36093607810698293419e+02,
+        };
+
+        static const T YP[] = {
+                +1.26320474790178026440e+09,
+                -6.47355876379160291031e+11,
+                +1.14509511541823727583e+14,
+                -8.12770255501325109621e+15,
+                +2.02439475713594898196e+17,
+                -7.78877196265950026825e+17,
+        };
+
+        static const T YQ[] = {
+                +5.94301592346128195359e+02,
+                +2.35564092943068577943e+05,
+                +7.34811944459721705660e+07,
+                +1.87601316108706159478e+10,
+                +3.88231277496238566008e+12,
+                +6.20557727146953693363e+14,
+                +6.87141087355300489866e+16,
+                +3.97270608116560655612e+18,
+        };
+
+        if (x <= T(5.0)) {
+            if (x == T(0.0)) {
+                return NEG_INFINITY;
+            }
+
+            if (x <= T(0.0)) {
+                return NAN;
+            }
+
+            T yp = 0.0;
+
+            for (uint8_t index = 0; index <= 5; index++) {
+                yp = yp * (x * x) + YP[index];
+            }
+
+            T yq = 0.0;
+
+            for (uint8_t index = 0; index <= 7; index++) {
+                yq = yq * (x * x) + YQ[index];
+            }
+
+            return x * (yp / yq) + (T(0.636619772367581343075535053490057448) * (bessel_j1_forward(x) * log(x) - T(1.0) / x));
+        }
+
+        T pp = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index];
+        }
+
+        T pq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index];
+        }
+
+        T qp = 0.0;
+
+        for (uint8_t index = 0; index <= 7; index++) {
+            qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index];
+        }
+
+        T qq = 0.0;
+
+        for (uint8_t index = 0; index <= 6; index++) {
+            qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index];
+        }
+
+        return (pp / pq * sin(x - T(2.356194490192344928846982537459627163)) + T(5.0) / x * (qp / qq) * cos(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x);
+    } // bessel_y1_forward(T x)
+); // bessel_y1_string
+
+const auto chebyshev_polynomial_t_string = jiterator_stringify(
+    template<typename T>
+    T chebyshev_polynomial_t_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(1.0)) {
+            if (x > T(0.0) || n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if ((n > 6) && (abs(x) < T(1.0))) {
+            return cos(n * acos(x));
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x;
+        }
+
+        T p = T(1.0);
+        T q = x;
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // chebyshev_polynomial_t_forward(T x, int64_t n)
+
+    template<typename T>
+    T chebyshev_polynomial_t_forward(T x, T n) {
+        return chebyshev_polynomial_t_forward(x, static_cast<int64_t>(n));
+    } // chebyshev_polynomial_t_forward(T x, T n)
+); // chebyshev_polynomial_t_string
+
+const auto chebyshev_polynomial_u_string = jiterator_stringify(
+    template<typename T>
+    T chebyshev_polynomial_u_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(1.0)) {
+            if (x > T(0.0) || n % 2 == 0) {
+                return n + 1;
+            }
+
+            return -(n + 1);
+        }
+
+        if ((n > 8) && (abs(x) < T(1.0))) {
+            if (sin(acos(x)) != T(0.0)) {
+                return sin((n + 1) * acos(x)) / sin(acos(x));
+            }
+
+            return (n + 1) * cos((n + 1) * acos(x)) / x;
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x;
+        }
+
+        T p = T(1.0);
+        T q = x + x;
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // chebyshev_polynomial_u_forward(T x, int64_t n)
+
+    template<typename T>
+    T chebyshev_polynomial_u_forward(T x, T n) {
+        return chebyshev_polynomial_u_forward(x, static_cast<int64_t>(n));
+    } // chebyshev_polynomial_u_forward(T x, T n)
+); // chebyshev_polynomial_u_string
+
+const auto chebyshev_polynomial_v_string = jiterator_stringify(
+    template<typename T>
+    T chebyshev_polynomial_v_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(1.0)) {
+            if (x > T(0.0)) {
+                return T(1.0);
+            }
+
+            if (n % 2 == 0) {
+                return n + n + 1;
+            }
+
+            return -(n + n + 1);
+        }
+
+        if ((n > 8) && (abs(x) < T(1.0))) {
+            if (sin(acos(x) / T(2.0)) != T(1.0)) {
+                return cos((n + T(0.5)) * acos(x)) / cos(acos(x) / T(2.0));
+            }
+
+            if (n % 2 == 0) {
+                return n + n + 1;
+            }
+
+            return -(n + n + 1);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x - T(1.0);
+        }
+
+        T p = T(1.0);
+        T q = x + x - T(1.0);
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // chebyshev_polynomial_v_forward(T x, int64_t n)
+
+    template<typename T>
+    T chebyshev_polynomial_v_forward(T x, T n) {
+        return chebyshev_polynomial_v_forward(x, static_cast<int64_t>(n));
+    } // chebyshev_polynomial_v_forward(T x, T n)
+); // chebyshev_polynomial_v_string
+
+const auto chebyshev_polynomial_w_string = jiterator_stringify(
+    template<typename T>
+    T chebyshev_polynomial_w_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(1.0)) {
+            if (x > T(0.0)) {
+                return n + n + 1;
+            }
+
+            if (n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if ((n > 8) && (abs(x) < T(1.0))) {
+            if (cos(acos(x) / T(2.0)) != T(1.0)) {
+                return sin((n + T(0.5)) * acos(x)) / sin(acos(x) / T(2.0));
+            }
+
+            if (x > T(0.0)) {
+                return n + n + 1;
+            }
+
+            if (n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x + T(1.0);
+        }
+
+        T p = T(1.0);
+        T q = x + x + T(1.0);
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // chebyshev_polynomial_w_forward(T x, int64_t n)
+
+    template<typename T>
+    T chebyshev_polynomial_w_forward(T x, T n) {
+        return chebyshev_polynomial_w_forward(x, static_cast<int64_t>(n));
+    } // chebyshev_polynomial_w_forward(T x, T n)
+); // chebyshev_polynomial_w_string
+
+const auto hermite_polynomial_h_string = jiterator_stringify(
+    template<typename T>
+    T hermite_polynomial_h_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x;
+        }
+
+        T p = T(1.0);
+        T q = x + x;
+        T r = T(0.0);
+
+        for (int64_t k = 2; k < n + n; k += 2) {
+            r = (x + x) * q - k * p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // hermite_polynomial_h_forward(T x, int64_t n)
+
+    template<typename T>
+    T hermite_polynomial_h_forward(T x, T n) {
+        return hermite_polynomial_h_forward(x, static_cast<int64_t>(n));
+    } // hermite_polynomial_h_forward(T x, T n)
+); // hermite_polynomial_h_string
+
+const auto hermite_polynomial_he_string = jiterator_stringify(
+    template<typename T>
+    T hermite_polynomial_he_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x;
+        }
+
+        T p = T(1.0);
+        T q = x;
+        T r;
+
+        for (int64_t k = 1; k < n; k++) {
+            r = x * q - k * p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // hermite_polynomial_he_forward(T x, int64_t n)
+
+    template<typename T>
+    T hermite_polynomial_he_forward(T x, T n) {
+        return hermite_polynomial_he_forward(x, static_cast<int64_t>(n));
+    } // hermite_polynomial_he_forward(T x, T n)
+); // hermite_polynomial_he_string
+
+const auto laguerre_polynomial_l_string = jiterator_stringify(
+    template<typename T>
+    T laguerre_polynomial_l_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(0.0)) {
+            return T(1.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return T(1.0) - x;
+        }
+
+        T p = T(1.0);
+        T q = T(1.0) - x;
+        T r;
+
+        for (int64_t k = 1; k < n; k++) {
+            r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1);
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // laguerre_polynomial_l_forward(T x, int64_t n)
+
+    template<typename T>
+    T laguerre_polynomial_l_forward(T x, T n) {
+        return laguerre_polynomial_l_forward(x, static_cast<int64_t>(n));
+    } // laguerre_polynomial_l_forward(T x, T n)
+); // laguerre_polynomial_l_string
+
+const auto legendre_polynomial_p_string = jiterator_stringify(
+    template<typename T>
+    T legendre_polynomial_p_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (abs(x) == T(1.0)) {
+            if (x > T(0.0) || n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x;
+        }
+
+        T p = T(1.0);
+        T q = x;
+        T r;
+
+        for (int64_t k = 1; k < n; k++) {
+            r = ((k + k + 1) * x * q - k * p) / (k + 1);
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // legendre_polynomial_p_forward(T x, int64_t n)
+
+    template<typename T>
+    T legendre_polynomial_p_forward(T x, T n) {
+        return legendre_polynomial_p_forward(x, static_cast<int64_t>(n));
+    } // legendre_polynomial_p_forward(T x, T n)
+); // legendre_polynomial_p_string
+
+const auto modified_bessel_i0_string = jiterator_stringify(
+    template<typename T>
+    T modified_bessel_i0_forward(T x) {
+        static const T A[] = {
+                -4.41534164647933937950e-18,
+                +3.33079451882223809783e-17,
+                -2.43127984654795469359e-16,
+                +1.71539128555513303061e-15,
+                -1.16853328779934516808e-14,
+                +7.67618549860493561688e-14,
+                -4.85644678311192946090e-13,
+                +2.95505266312963983461e-12,
+                -1.72682629144155570723e-11,
+                +9.67580903537323691224e-11,
+                -5.18979560163526290666e-10,
+                +2.65982372468238665035e-09,
+                -1.30002500998624804212e-08,
+                +6.04699502254191894932e-08,
+                -2.67079385394061173391e-07,
+                +1.11738753912010371815e-06,
+                -4.41673835845875056359e-06,
+                +1.64484480707288970893e-05,
+                -5.75419501008210370398e-05,
+                +1.88502885095841655729e-04,
+                -5.76375574538582365885e-04,
+                +1.63947561694133579842e-03,
+                -4.32430999505057594430e-03,
+                +1.05464603945949983183e-02,
+                -2.37374148058994688156e-02,
+                +4.93052842396707084878e-02,
+                -9.49010970480476444210e-02,
+                +1.71620901522208775349e-01,
+                -3.04682672343198398683e-01,
+                +6.76795274409476084995e-01,
+        };
+
+        static const T B[] = {
+                -7.23318048787475395456e-18,
+                -4.83050448594418207126e-18,
+                +4.46562142029675999901e-17,
+                +3.46122286769746109310e-17,
+                -2.82762398051658348494e-16,
+                -3.42548561967721913462e-16,
+                +1.77256013305652638360e-15,
+                +3.81168066935262242075e-15,
+                -9.55484669882830764870e-15,
+                -4.15056934728722208663e-14,
+                +1.54008621752140982691e-14,
+                +3.85277838274214270114e-13,
+                +7.18012445138366623367e-13,
+                -1.79417853150680611778e-12,
+                -1.32158118404477131188e-11,
+                -3.14991652796324136454e-11,
+                +1.18891471078464383424e-11,
+                +4.94060238822496958910e-10,
+                +3.39623202570838634515e-09,
+                +2.26666899049817806459e-08,
+                +2.04891858946906374183e-07,
+                +2.89137052083475648297e-06,
+                +6.88975834691682398426e-05,
+                +3.36911647825569408990e-03,
+                +8.04490411014108831608e-01,
+        };
+
+        T p;
+        T q = 0.0;
+
+        if (abs(x) <= T(8.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 30; index++) {
+                p = q;
+                q = a;
+                a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index];
+            }
+
+            return exp(abs(x)) * (T(0.5) * (a - p));
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index];
+        }
+
+        return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x));
+    } // modified_bessel_i0_forward(T x)
+); // modified_bessel_i0_string
+
+const auto modified_bessel_i1_string = jiterator_stringify(
+    template<typename T>
+    T modified_bessel_i1_forward(T x) {
+        static const T A[] = {
+                +2.77791411276104639959e-18,
+                -2.11142121435816608115e-17,
+                +1.55363195773620046921e-16,
+                -1.10559694773538630805e-15,
+                +7.60068429473540693410e-15,
+                -5.04218550472791168711e-14,
+                +3.22379336594557470981e-13,
+                -1.98397439776494371520e-12,
+                +1.17361862988909016308e-11,
+                -6.66348972350202774223e-11,
+                +3.62559028155211703701e-10,
+                -1.88724975172282928790e-09,
+                +9.38153738649577178388e-09,
+                -4.44505912879632808065e-08,
+                +2.00329475355213526229e-07,
+                -8.56872026469545474066e-07,
+                +3.47025130813767847674e-06,
+                -1.32731636560394358279e-05,
+                +4.78156510755005422638e-05,
+                -1.61760815825896745588e-04,
+                +5.12285956168575772895e-04,
+                -1.51357245063125314899e-03,
+                +4.15642294431288815669e-03,
+                -1.05640848946261981558e-02,
+                +2.47264490306265168283e-02,
+                -5.29459812080949914269e-02,
+                +1.02643658689847095384e-01,
+                -1.76416518357834055153e-01,
+                +2.52587186443633654823e-01,
+        };
+
+        static const T B[] = {
+                +7.51729631084210481353e-18,
+                +4.41434832307170791151e-18,
+                -4.65030536848935832153e-17,
+                -3.20952592199342395980e-17,
+                +2.96262899764595013876e-16,
+                +3.30820231092092828324e-16,
+                -1.88035477551078244854e-15,
+                -3.81440307243700780478e-15,
+                +1.04202769841288027642e-14,
+                +4.27244001671195135429e-14,
+                -2.10154184277266431302e-14,
+                -4.08355111109219731823e-13,
+                -7.19855177624590851209e-13,
+                +2.03562854414708950722e-12,
+                +1.41258074366137813316e-11,
+                +3.25260358301548823856e-11,
+                -1.89749581235054123450e-11,
+                -5.58974346219658380687e-10,
+                -3.83538038596423702205e-09,
+                -2.63146884688951950684e-08,
+                -2.51223623787020892529e-07,
+                -3.88256480887769039346e-06,
+                -1.10588938762623716291e-04,
+                -9.76109749136146840777e-03,
+                +7.78576235018280120474e-01,
+        };
+
+        T p;
+        T q = 0.0;
+
+        if (abs(x) <= T(8.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 29; index++) {
+                p = q;
+                q = a;
+                a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index];
+            }
+
+            if (x < T(0.0)) {
+                return -(T(0.5) * (a - p) * abs(x) * exp(abs(x)));
+            }
+
+            return T(0.5) * (a - p) * abs(x) * exp(abs(x));
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index];
+        }
+
+        if (x < T(0.0)) {
+            return -(exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)));
+        }
+
+        return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x));
+    } // modified_bessel_i1_forward(T x)
+); // modified_bessel_i1_string
+
+const auto modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify(
+    template<typename T>
+    T modified_bessel_k0_forward(T x) {
+        static const T A[] = {
+                +1.37446543561352307156e-16,
+                +4.25981614279661018399e-14,
+                +1.03496952576338420167e-11,
+                +1.90451637722020886025e-09,
+                +2.53479107902614945675e-07,
+                +2.28621210311945178607e-05,
+                +1.26461541144692592338e-03,
+                +3.59799365153615016266e-02,
+                +3.44289899924628486886e-01,
+                -5.35327393233902768720e-01,
+        };
+
+        static const T B[] = {
+                +5.30043377268626276149e-18,
+                -1.64758043015242134646e-17,
+                +5.21039150503902756861e-17,
+                -1.67823109680541210385e-16,
+                +5.51205597852431940784e-16,
+                -1.84859337734377901440e-15,
+                +6.34007647740507060557e-15,
+                -2.22751332699166985548e-14,
+                +8.03289077536357521100e-14,
+                -2.98009692317273043925e-13,
+                +1.14034058820847496303e-12,
+                -4.51459788337394416547e-12,
+                +1.85594911495471785253e-11,
+                -7.95748924447710747776e-11,
+                +3.57739728140030116597e-10,
+                -1.69753450938905987466e-09,
+                +8.57403401741422608519e-09,
+                -4.66048989768794782956e-08,
+                +2.76681363944501510342e-07,
+                -1.83175552271911948767e-06,
+                +1.39498137188764993662e-05,
+                -1.28495495816278026384e-04,
+                +1.56988388573005337491e-03,
+                -3.14481013119645005427e-02,
+                +2.44030308206595545468e+00,
+        };
+
+        if (x == T(0.0)) {
+            return INFINITY;
+        }
+
+        if (x < T(0.0)) {
+            return NAN;
+        }
+
+        T p;
+        T q = 0.0;
+
+        if (x <= T(2.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 10; index++) {
+                p = q;
+                q = a;
+                a = (x * x - T(2.0)) * q - p + A[index];
+            }
+
+            return T(0.5) * (a - p) - log(0.5 * x) * modified_bessel_i0_forward(x);
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+        }
+
+        return exp(-x) * (T(0.5) * (b - p)) / sqrt(x);
+    } // modified_bessel_k0_forward(T x)
+); // modified_bessel_k0_string
+
+const auto scaled_modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify(
+    template<typename T>
+    T scaled_modified_bessel_k0_forward(T x) {
+        static const T A[] = {
+                +1.37446543561352307156e-16,
+                +4.25981614279661018399e-14,
+                +1.03496952576338420167e-11,
+                +1.90451637722020886025e-09,
+                +2.53479107902614945675e-07,
+                +2.28621210311945178607e-05,
+                +1.26461541144692592338e-03,
+                +3.59799365153615016266e-02,
+                +3.44289899924628486886e-01,
+                -5.35327393233902768720e-01,
+        };
+
+        static const T B[] = {
+                +5.30043377268626276149e-18,
+                -1.64758043015242134646e-17,
+                +5.21039150503902756861e-17,
+                -1.67823109680541210385e-16,
+                +5.51205597852431940784e-16,
+                -1.84859337734377901440e-15,
+                +6.34007647740507060557e-15,
+                -2.22751332699166985548e-14,
+                +8.03289077536357521100e-14,
+                -2.98009692317273043925e-13,
+                +1.14034058820847496303e-12,
+                -4.51459788337394416547e-12,
+                +1.85594911495471785253e-11,
+                -7.95748924447710747776e-11,
+                +3.57739728140030116597e-10,
+                -1.69753450938905987466e-09,
+                +8.57403401741422608519e-09,
+                -4.66048989768794782956e-08,
+                +2.76681363944501510342e-07,
+                -1.83175552271911948767e-06,
+                +1.39498137188764993662e-05,
+                -1.28495495816278026384e-04,
+                +1.56988388573005337491e-03,
+                -3.14481013119645005427e-02,
+                +2.44030308206595545468e+00,
+        };
+
+        if (x == T(0.0)) {
+            return INFINITY;
+        }
+
+        if (x < T(0.0)) {
+            return NAN;
+        }
+
+        T p;
+        T q = 0.0;
+
+        if (x <= T(2.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 10; index++) {
+                p = q;
+                q = a;
+                a = (x * x - T(2.0)) * q - p + A[index];
+            }
+
+            return (T(0.5) * (a - p) - log(T(0.5) * x) * modified_bessel_i0_forward(x)) * exp(x);
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+        }
+
+        return T(0.5) * (b - p) / sqrt(x);
+    } // T scaled_modified_bessel_k0_forward(T x)
+); // scaled_modified_bessel_k0_string
+
+const auto modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify(
+    template<typename T>
+    T modified_bessel_k1_forward(T x) {
+        static const T A[] = {
+                -7.02386347938628759343e-18,
+                -2.42744985051936593393e-15,
+                -6.66690169419932900609e-13,
+                -1.41148839263352776110e-10,
+                -2.21338763073472585583e-08,
+                -2.43340614156596823496e-06,
+                -1.73028895751305206302e-04,
+                -6.97572385963986435018e-03,
+                -1.22611180822657148235e-01,
+                -3.53155960776544875667e-01,
+                +1.52530022733894777053e+00,
+        };
+
+        static const T B[] = {
+                -5.75674448366501715755e-18,
+                +1.79405087314755922667e-17,
+                -5.68946255844285935196e-17,
+                +1.83809354436663880070e-16,
+                -6.05704724837331885336e-16,
+                +2.03870316562433424052e-15,
+                -7.01983709041831346144e-15,
+                +2.47715442448130437068e-14,
+                -8.97670518232499435011e-14,
+                +3.34841966607842919884e-13,
+                -1.28917396095102890680e-12,
+                +5.13963967348173025100e-12,
+                -2.12996783842756842877e-11,
+                +9.21831518760500529508e-11,
+                -4.19035475934189648750e-10,
+                +2.01504975519703286596e-09,
+                -1.03457624656780970260e-08,
+                +5.74108412545004946722e-08,
+                -3.50196060308781257119e-07,
+                +2.40648494783721712015e-06,
+                -1.93619797416608296024e-05,
+                +1.95215518471351631108e-04,
+                -2.85781685962277938680e-03,
+                +1.03923736576817238437e-01,
+                +2.72062619048444266945e+00,
+        };
+
+        if (x == T(0.0)) {
+            return INFINITY;
+        }
+
+        if (x < T(0.0)) {
+            return NAN;
+        }
+
+        T p;
+        T q = 0.0;
+
+        if (x <= T(2.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 11; index++) {
+                p = q;
+                q = a;
+                a = (x * x - T(2.0)) * q - p + A[index];
+            }
+
+            return log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x;
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+        }
+
+        return exp(-x) * (T(0.5) * (b - p)) / sqrt(x);
+    } // modified_bessel_k1_forward(T x)
+); // modified_bessel_k1_string
+
+const auto scaled_modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify(
+    template<typename T>
+    T scaled_modified_bessel_k1_forward(T x) {
+        static const T A[] = {
+                -7.02386347938628759343e-18,
+                -2.42744985051936593393e-15,
+                -6.66690169419932900609e-13,
+                -1.41148839263352776110e-10,
+                -2.21338763073472585583e-08,
+                -2.43340614156596823496e-06,
+                -1.73028895751305206302e-04,
+                -6.97572385963986435018e-03,
+                -1.22611180822657148235e-01,
+                -3.53155960776544875667e-01,
+                +1.52530022733894777053e+00,
+        };
+
+        static const T B[] = {
+                -5.75674448366501715755e-18,
+                +1.79405087314755922667e-17,
+                -5.68946255844285935196e-17,
+                +1.83809354436663880070e-16,
+                -6.05704724837331885336e-16,
+                +2.03870316562433424052e-15,
+                -7.01983709041831346144e-15,
+                +2.47715442448130437068e-14,
+                -8.97670518232499435011e-14,
+                +3.34841966607842919884e-13,
+                -1.28917396095102890680e-12,
+                +5.13963967348173025100e-12,
+                -2.12996783842756842877e-11,
+                +9.21831518760500529508e-11,
+                -4.19035475934189648750e-10,
+                +2.01504975519703286596e-09,
+                -1.03457624656780970260e-08,
+                +5.74108412545004946722e-08,
+                -3.50196060308781257119e-07,
+                +2.40648494783721712015e-06,
+                -1.93619797416608296024e-05,
+                +1.95215518471351631108e-04,
+                -2.85781685962277938680e-03,
+                +1.03923736576817238437e-01,
+                +2.72062619048444266945e+00,
+        };
+
+        if (x == T(0.0)) {
+            return INFINITY;
+        }
+
+        if (x < T(0.0)) {
+            return NAN;
+        }
+
+        T p;
+        T q = 0.0;
+
+        if (x <= T(2.0)) {
+            T a = A[0];
+
+            for (uint8_t index = 1; index < 11; index++) {
+                p = q;
+                q = a;
+                a = (x * x - T(2.0)) * q - p + A[index];
+            }
+
+            return (log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x) * exp(x);
+        }
+
+        T b = B[0];
+
+        for (uint8_t index = 1; index < 25; index++) {
+            p = q;
+            q = b;
+            b = (T(8.0) / x - T(2.0)) * q - p + B[index];
+        }
+
+        return (T(0.5) * (b - p) / sqrt(x));
+    } // T scaled_modified_bessel_k1_forward(T x)
+); // scaled_modified_bessel_k1_string
+
+const auto shifted_chebyshev_polynomial_t_string = jiterator_stringify(
+    template<typename T>
+    T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (x == T(1.0)) {
+            return T(1.0);
+        }
+
+        if (x == T(0.0)) {
+            if (n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) {
+            return cos(n * acos(x + x - T(1.0)));
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x - T(1.0);
+        }
+
+        T p = T(1.0);
+        T q = x + x - T(1.0);
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // shifted_chebyshev_polynomial_t_forward(T x, int64_t n)
+
+    template<typename T>
+    T shifted_chebyshev_polynomial_t_forward(T x, T n) {
+        return shifted_chebyshev_polynomial_t_forward(x, static_cast<int64_t>(n));
+    } // shifted_chebyshev_polynomial_t_forward(T x, T n)
+); // shifted_chebyshev_polynomial_t_string
+
+const auto shifted_chebyshev_polynomial_u_string = jiterator_stringify(
+    template<typename T>
+    T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (x == T(1.0)) {
+            return n + 1;
+        }
+
+        if (x == T(0.0)) {
+            if (n % 2 == 0) {
+                return n + 1;
+            }
+
+            return -(n + 1);
+        }
+
+        if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) {
+            if (sin(acos(x + x - T(1.0))) != T(0.0)) {
+                return sin((n + 1) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0)));
+            }
+
+            return (n + 1) * cos((n + 1) * acos(x + x - T(1.0))) / (x + x - T(1.0));
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x - T(1.0) + (x + x - T(1.0));
+        }
+
+        T p = T(1.0);
+        T q = x + x - T(1.0) + (x + x - T(1.0));
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // shifted_chebyshev_polynomial_u_forward(T x, int64_t n)
+
+    template<typename T>
+    T shifted_chebyshev_polynomial_u_forward(T x, T n) {
+        return shifted_chebyshev_polynomial_u_forward(x, static_cast<int64_t>(n));
+    } // shifted_chebyshev_polynomial_u_forward(T x, T n)
+); // shifted_chebyshev_polynomial_u_string
+
+const auto shifted_chebyshev_polynomial_v_string = jiterator_stringify(
+    template<typename T>
+    T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (x == T(1.0)) {
+            return T(1.0);
+        }
+
+        if (x == T(0.0)) {
+            if (n % 2 == 0) {
+                return (n + n + 1);
+            }
+
+            return -(n + n + 1);
+        }
+
+        if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) {
+            if (sin(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) {
+                return cos(((n) + T(0.5)) * acos(x + x - T(1.0))) / cos(acos(x + x - T(1.0)) / T(2.0));
+            }
+
+            if (n % 2 == 0) {
+                return n + n + 1;
+            }
+
+            return -(n + n + 1);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
+        }
+
+        T p = T(1.0);
+        T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // shifted_chebyshev_polynomial_v_forward(T x, int64_t n)
+
+    template<typename T>
+    T shifted_chebyshev_polynomial_v_forward(T x, T n) {
+        return shifted_chebyshev_polynomial_v_forward(x, static_cast<int64_t>(n));
+    } // shifted_chebyshev_polynomial_v_forward(T x, T n)
+); // shifted_chebyshev_polynomial_v_string
+
+const auto shifted_chebyshev_polynomial_w_string = jiterator_stringify(
+    template<typename T>
+    T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) {
+        if (n < 0) {
+            return T(0.0);
+        }
+
+        if (x == T(1.0)) {
+            return n + n + 1;
+        }
+
+        if (x == T(0.0)) {
+            if (n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if ((n > 4) && (abs(x + x - T(1.0)) < T(1.0))) {
+            if (cos(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) {
+                return sin((n + T(0.5)) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0)) / T(2.0));
+            }
+
+            if (n % 2 == 0) {
+                return T(1.0);
+            }
+
+            return T(-1.0);
+        }
+
+        if (n == 0) {
+            return T(1.0);
+        }
+
+        if (n == 1) {
+            return x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
+        }
+
+        T p = T(1.0);
+        T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
+        T r;
+
+        for (int64_t k = 2; k <= n; k++) {
+            r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
+            p = q;
+            q = r;
+        }
+
+        return r;
+    } // shifted_chebyshev_polynomial_w_forward(T x, int64_t n)
+
+    template<typename T>
+    T shifted_chebyshev_polynomial_w_forward(T x, T n) {
+        return shifted_chebyshev_polynomial_w_forward(x, static_cast<int64_t>(n));
+    } // shifted_chebyshev_polynomial_w_forward(T x, T n)
+); // shifted_chebyshev_polynomial_w_string
+
+const auto spherical_bessel_j0_string = jiterator_stringify(
+    template<typename T>
+    T spherical_bessel_j0_forward(T x) {
+        if (isinf(x)) {
+            return T(0.0);
+        }
+
+        if (abs(x) < T(0.5)) {
+            return T(1.0) + x * x * (T(-1.0) / T(6.0) + x * x * (T(1.0) / T(120.0) + x * x * (T(-1.0) / T(5040.0) + x * x * (T(1.0) / T(362880.0) + x * x * (T(-1.0) / T(39916800.0) + x * x * (T(1.0) / T(6227020800.0)))))));
+        }
+
+        return sin(x) / x;
+    } // T spherical_bessel_j0_forward(T x)
+); // spherical_bessel_j0_string
+
+#else // !AT_USE_JITERATOR() -- kernels must be precompiled
+
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_gcd(scalar_t a_in, scalar_t b_in) {
+  scalar_t a = ::abs(a_in);
+  scalar_t b = ::abs(b_in);
+  while (a != 0) {
+    scalar_t c = a;
+    a = b % a;
+    b = c;
+  }
+  return b;
+}
+
+/*
+ * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h".
+ */
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) {
+  // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma
+  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
+  static const double PI_f64 = 3.14159265358979323846;
+  const accscalar_t PSI_10 = 2.25175258906672110764;
+  const accscalar_t A[] = {
+      8.33333333333333333333E-2,
+      -2.10927960927960927961E-2,
+      7.57575757575757575758E-3,
+      -4.16666666666666666667E-3,
+      3.96825396825396825397E-3,
+      -8.33333333333333333333E-3,
+      8.33333333333333333333E-2,
+  };
+
+  accscalar_t x = static_cast<accscalar_t>(in);
+  if (x == 0) {
+    // As per C++ standard for gamma related functions and SciPy,
+    // If the argument is ±0, ±∞ is returned
+    return std::copysign(static_cast<scalar_t>(INFINITY), -x);
+  }
+
+  bool x_is_integer = x == ::trunc(x);
+  accscalar_t result = 0;
+  if (x < 0) {
+    if (x_is_integer) {
+      // As per C++ standard for gamma related functions and SciPy,
+      // If the argument is a negative integer, NaN is returned
+      return static_cast<scalar_t>(NAN);
+    }
+    // Extracts the fractional part of x as r, since tan(pi * r) is more numerically
+    // accurate than tan(pi * x). While these operations are mathematically equivalent
+    // since both x and r are in radians and tan() has a periodicity of pi, in practice
+    // the computation of pi * x is a source of error (when |x| > 1).
+    double q, r;
+    r = ::modf(static_cast<double>(x), &q);
+    result = static_cast<accscalar_t>(- PI_f64 / ::tan(PI_f64 * r));
+    x = 1 - x;
+  }
+
+  while (x < 10) {
+    result -= 1 / x;
+    x += 1;
+  }
+  if (x == 10) {
+    return static_cast<scalar_t>(result + PSI_10);
+  }
+
+  accscalar_t y = 0;
+  if (x < 1.0e17) {
+    accscalar_t z = 1 / (x * x);
+
+    accscalar_t polevl_result = 0;
+    for (int i = 0; i <= 6; i++) {
+      polevl_result = polevl_result * z + A[i];
+    }
+    y = z * polevl_result;
+  }
+
+  return static_cast<scalar_t>(::log(x) - (static_cast<accscalar_t>(0.5) / x) - y + result);
+}
+
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) {
+  using accscalar_t = at::acc_type<scalar_t, /*is_cuda=*/true>;
+  const accscalar_t PI = 3.14159265358979323846;
+  accscalar_t x = static_cast<accscalar_t>(in);
+  accscalar_t sign = +1;
+  accscalar_t result = 0;
+  if (x < 0.5f) {
+    sign = -1;
+    accscalar_t sin_pi_x = ::sin(PI * x);
+    result -= (PI * PI) / (sin_pi_x * sin_pi_x);
+    x = 1 - x;
+  }
+  for (int i = 0; i < 6; ++i) {
+    result += 1 / (x * x);
+    x += 1;
+  }
+  const accscalar_t one = static_cast<scalar_t>(1);
+  const accscalar_t ixx = 1 / (x*x);
+  result += (1 + 1 / (2*x) + ixx * (one/6 - ixx * (one/30 - ixx * (one/42)))) / x;
+  return static_cast<scalar_t>(sign * result);
+}
+
+/*
+ * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h".
+ */
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t
+chbevl(scalar_t _x, const scalar_t array[], size_t len) {
+  static_assert(!std::is_same<scalar_t, Half>() && !std::is_same<scalar_t, BFloat16>(), "don't instantiate with low precision type");
+
+  scalar_t b0, b1, b2;
+
+  b0 = array[0];
+  b1 = 0;
+
+  for (size_t i = 1; i < len; ++i)  {
+    b2 = b1;
+    b1 = b0;
+    b0 = _x * b1 - b2 + array[i];
+  }
+
+  return (0.5 * (b0 - b2));
+}
+
+/*
+ * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h".
+ */
+template <typename T>
+C10_HOST_DEVICE inline std::tuple<const T*, size_t> chebyshev_coefficients_i0e_A() {
+  /* Chebyshev coefficients for exp(-x) I0(x)
+   * in the interval [0,8].
+   *
+   * lim(x->0){ exp(-x) I0(x) } = 1.
+   */
+  static const T coefficients[] = {
+      -4.41534164647933937950E-18, 3.33079451882223809783E-17,
+      -2.43127984654795469359E-16, 1.71539128555513303061E-15,
+      -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+      -4.85644678311192946090E-13, 2.95505266312963983461E-12,
+      -1.72682629144155570723E-11, 9.67580903537323691224E-11,
+      -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+      -1.30002500998624804212E-8,  6.04699502254191894932E-8,
+      -2.67079385394061173391E-7,  1.11738753912010371815E-6,
+      -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+      -5.75419501008210370398E-5,  1.88502885095841655729E-4,
+      -5.76375574538582365885E-4,  1.63947561694133579842E-3,
+      -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+      -2.37374148058994688156E-2,  4.93052842396707084878E-2,
+      -9.49010970480476444210E-2,  1.71620901522208775349E-1,
+      -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+
+  return std::make_tuple(coefficients, 30);
+}
+
+template <typename T>
+C10_HOST_DEVICE inline std::tuple<const T*, size_t> chebyshev_coefficients_i0e_B() {
+  /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x)
+   * in the inverted interval [8,infinity].
+   *
+   * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi).
+   */
+  static const T coefficients[] = {
+      -7.23318048787475395456E-18, -4.83050448594418207126E-18,
+      4.46562142029675999901E-17,  3.46122286769746109310E-17,
+      -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+      1.77256013305652638360E-15,  3.81168066935262242075E-15,
+      -9.55484669882830764870E-15, -4.15056934728722208663E-14,
+      1.54008621752140982691E-14,  3.85277838274214270114E-13,
+      7.18012445138366623367E-13,  -1.79417853150680611778E-12,
+      -1.32158118404477131188E-11, -3.14991652796324136454E-11,
+      1.18891471078464383424E-11,  4.94060238822496958910E-10,
+      3.39623202570838634515E-9,   2.26666899049817806459E-8,
+      2.04891858946906374183E-7,   2.89137052083475648297E-6,
+      6.88975834691682398426E-5,   3.36911647825569408990E-3,
+      8.04490411014108831608E-1};
+
+  return std::make_tuple(coefficients, 25);
+}
+
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) {
+  static_assert(!std::is_same<scalar_t, Half>() && !std::is_same<scalar_t, BFloat16>(), "don't instantiate with low precision type");
+  // Upcast input for numerical accuracy purposes
+  // Needed for accurate results if input is bfloat16 or float16
+  scalar_t x = ::abs(_x);
+
+  if (x <= scalar_t{8.0}) {
+    auto coeff_pair = chebyshev_coefficients_i0e_A<scalar_t>();
+    auto A = std::get<0>(coeff_pair);
+    auto len = std::get<1>(coeff_pair);
+    scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0};
+    return (::exp(x) * chbevl(y, A, len));
+  }
+
+  auto coeff_pair = chebyshev_coefficients_i0e_B<scalar_t>();
+  auto B = std::get<0>(coeff_pair);
+  auto len = std::get<1>(coeff_pair);
+  return (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x));
+}
+
+template <typename T>
+C10_HOST_DEVICE inline
+    typename std::enable_if<std::is_same<double, T>::value, std::tuple<const T*, size_t>>::type
+    chebyshev_coefficients_i1e_A() {
+  /* Chebyshev coefficients for exp(-x) I1(x)
+   * in the interval [0,8].
+   *
+   * lim(x->0){ exp(-x) I1(x) / x } = 1/2.
+   */
+  static const T coefficients[] = {
+      2.77791411276104639959E-18, -2.11142121435816608115E-17,
+      1.55363195773620046921E-16, -1.10559694773538630805E-15,
+      7.60068429473540693410E-15, -5.04218550472791168711E-14,
+      3.22379336594557470981E-13, -1.98397439776494371520E-12,
+      1.17361862988909016308E-11, -6.66348972350202774223E-11,
+      3.62559028155211703701E-10, -1.88724975172282928790E-9,
+      9.38153738649577178388E-9,  -4.44505912879632808065E-8,
+      2.00329475355213526229E-7,  -8.56872026469545474066E-7,
+      3.47025130813767847674E-6,  -1.32731636560394358279E-5,
+      4.78156510755005422638E-5,  -1.61760815825896745588E-4,
+      5.12285956168575772895E-4,  -1.51357245063125314899E-3,
+      4.15642294431288815669E-3,  -1.05640848946261981558E-2,
+      2.47264490306265168283E-2,  -5.29459812080949914269E-2,
+      1.02643658689847095384E-1,  -1.76416518357834055153E-1,
+      2.52587186443633654823E-1};
+
+  return std::make_tuple(coefficients, 29);
+}
+
+template <typename T>
+C10_HOST_DEVICE inline
+    typename std::enable_if<std::is_same<float, T>::value, std::tuple<const T*, size_t>>::type
+    chebyshev_coefficients_i1e_A() {
+  /* Chebyshev coefficients for exp(-x) I1(x)
+   * in the interval [0,8].
+   *
+   * lim(x->0){ exp(-x) I1(x) / x } = 1/2.
+   */
+  static const T coeff[] = {
+      9.38153738649577178388E-9f,
+      -4.44505912879632808065E-8f,
+      2.00329475355213526229E-7f,
+      -8.56872026469545474066E-7f,
+      3.47025130813767847674E-6f,
+      -1.32731636560394358279E-5f,
+      4.78156510755005422638E-5f,
+      -1.61760815825896745588E-4f,
+      5.12285956168575772895E-4f,
+      -1.51357245063125314899E-3f,
+      4.15642294431288815669E-3f,
+      -1.05640848946261981558E-2f,
+      2.47264490306265168283E-2f,
+      -5.29459812080949914269E-2f,
+      1.02643658689847095384E-1f,
+      -1.76416518357834055153E-1f,
+      2.52587186443633654823E-1f};
+  return std::make_tuple(coeff, 17);
+};
+
+template <typename T>
+C10_HOST_DEVICE inline
+    typename std::enable_if<std::is_same<double, T>::value, std::tuple<const T*, size_t>>::type
+    chebyshev_coefficients_i1e_B() {
+  /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x)
+   * in the inverted interval [8,infinity].
+   *
+   * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi).
+   */
+  static const T coefficients[] = {
+      7.51729631084210481353E-18,  4.41434832307170791151E-18,
+      -4.65030536848935832153E-17, -3.20952592199342395980E-17,
+      2.96262899764595013876E-16,  3.30820231092092828324E-16,
+      -1.88035477551078244854E-15, -3.81440307243700780478E-15,
+      1.04202769841288027642E-14,  4.27244001671195135429E-14,
+      -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+      -7.19855177624590851209E-13, 2.03562854414708950722E-12,
+      1.41258074366137813316E-11,  3.25260358301548823856E-11,
+      -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+      -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
+      -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
+      -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+      7.78576235018280120474E-1};
+
+  return std::make_tuple(coefficients, 25);
+}
+
+template <typename T>
+C10_HOST_DEVICE inline
+    typename std::enable_if<std::is_same<float, T>::value, std::tuple<const T*, size_t>>::type
+    chebyshev_coefficients_i1e_B() {
+  /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x)
+   * in the inverted interval [8,infinity].
+   *
+   * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi).
+   */
+  static const T coeff[] = {
+      -3.83538038596423702205E-9f,
+      -2.63146884688951950684E-8f,
+      -2.51223623787020892529E-7f,
+      -3.88256480887769039346E-6f,
+      -1.10588938762623716291E-4f,
+      -9.76109749136146840777E-3f,
+      7.78576235018280120474E-1f};
+
+  return std::make_tuple(coeff, 7);
+};
+
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_i1(scalar_t _x) {
+  const auto x = ::abs(_x);
+  if (x <= scalar_t{8.0}) {
+    auto coeff_pair = chebyshev_coefficients_i1e_A<scalar_t>();
+    auto A = std::get<0>(coeff_pair);
+    auto len = std::get<1>(coeff_pair);
+    scalar_t y = x / scalar_t{2.0} - scalar_t{2.0};
+    const scalar_t out = ::exp(x) * x * chbevl(y, A, len);
+    return (_x < scalar_t{0.0}) ? -out : out;
+  }
+
+  auto coeff_pair = chebyshev_coefficients_i1e_B<scalar_t>();
+  auto B = std::get<0>(coeff_pair);
+  auto len = std::get<1>(coeff_pair);
+  const scalar_t out = (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len)) / ::sqrt(x);
+  return (_x < scalar_t{0.0}) ? -out : out;
+}
+
+template <typename scalar_t>
+static inline C10_HOST_DEVICE scalar_t calc_i1e(scalar_t _x) {
+  const auto x = ::abs(_x);
+  if (x <= scalar_t{8.0}) {
+    auto coeff_pair = chebyshev_coefficients_i1e_A<scalar_t>();
+    auto A = std::get<0>(coeff_pair);
+    auto len = std::get<1>(coeff_pair);
+    const scalar_t y = x / scalar_t{2.0} - scalar_t{2.0};
+    const scalar_t out = chbevl(y, A, len) * x;
+    return (_x < scalar_t{0.0}) ? -out : out;
+  }
+
+  auto coeff_pair = chebyshev_coefficients_i1e_B<scalar_t>();
+  auto B = std::get<0>(coeff_pair);
+  auto len = std::get<1>(coeff_pair);
+  const scalar_t out = chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x);
+  return (_x < scalar_t{0.0}) ? -out : out;
+}
+
+#endif // AT_USE_JITERATOR() (this closes the "else" branch of a if/else preprocessor directive)
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4553276bab684eca9d858bc81ae8f34c9445afa5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/PersistentSoftmax.cuh
@@ -0,0 +1,401 @@
+#pragma once
+
+#include <cfloat>
+#include <limits>
+#include <stdint.h>
+#include <cuda_fp16.h>
+#include <c10/macros/Macros.h>
+
+#include <ATen/cuda/DeviceUtils.cuh>
+
+namespace {
+
+int log2_ceil(int value) {
+    int log2_value = 0;
+    while ((1 << log2_value) < value) ++log2_value;
+    return log2_value;
+}
+
+template<typename T>
+struct Add {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a + b;
+  }
+};
+
+template<typename T>
+struct Max {
+  __device__ __forceinline__ T operator()(T a, T b) const {
+    return a < b ? b : a;
+  }
+};
+
+template <typename acc_t, int WARP_BATCH, int WARP_SIZE, template<typename> class ReduceOp>
+__device__ __forceinline__ void warp_reduce(acc_t* sum) {
+    ReduceOp<acc_t> r;
+    #pragma unroll
+    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
+        #pragma unroll
+        for (int i = 0;  i < WARP_BATCH;  ++i) {
+            acc_t b = WARP_SHFL_XOR(sum[i], offset, WARP_SIZE);
+            sum[i] = r(sum[i], b);
+        }
+    }
+}
+
+// The softmax_warp_* methods perform softmax forward and backward propagation on samples spanning the fast dimension.
+// Each sample contains element_count scalar elements. element_count can be any integer value <= 1024.
+// The template arguments have the following meaning:
+// One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples.
+// WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small.
+// A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp.
+// This is important because it means only __shfl_ instructions are required for reductions.
+// Note that this means WARP_SIZE must be a power of two and <= architecture warp size.
+// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch.
+// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs.
+// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed.
+// is_masked is a flag indicating whether SoftMax or MaskedSoftMax should be computed.
+// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t.
+// This allows SoftMax to be fused with a cast immediately following the SoftMax.
+// The mask should have the same shape as input, with a boolean indicate if the value is masked.
+// The head_chunk_size is only used for transformer mask softmax, equals to H * D * D.
+// For instance:
+// input_t=half,  acc_t=float, output_t=half  => read half tensor, float accumulators, write half tensor.
+// input_t=half,  acc_t=float, output_t=float => read half tensor, float accumulators, write float tensor.
+// input_t_float, acc_t=float, output_t=half  => read float tensor, float accumulators, write half tensor.
+
+template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax, bool is_masked>
+__global__ void softmax_warp_forward(output_t *dst, const input_t *src, int batch_size, int stride, int element_count, const bool *mask = nullptr, const int head_chunk_size = -1, bool is_transformer_mask = false)
+{
+    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel.
+    constexpr int next_power_of_two = 1 << log2_elements;
+    constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
+    constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
+
+    int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
+
+    // batch_size might not be a multiple of WARP_BATCH. Check how
+    // many batches have to computed within this WARP.
+    int local_batches = batch_size - first_batch;
+    if (local_batches > WARP_BATCH)
+        local_batches = WARP_BATCH;
+
+    // there might be multiple batches per warp. compute the index within the batch
+    int local_idx = threadIdx.x;
+    int idx_offset = first_batch * stride + local_idx;
+
+    src += idx_offset;
+    dst += idx_offset;
+
+    if (is_transformer_mask) {
+        mask += ((first_batch * stride) / head_chunk_size) * stride + local_idx;
+    } else {
+        mask += idx_offset;
+    }
+    // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop,
+    // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep
+    // the nested loops.
+    // This should have no impact on performance because the loops are unrolled anyway.
+
+    // load data from global memory
+    acc_t elements[WARP_BATCH][WARP_ITERATIONS];
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < batch_element_count) {
+                elements[i][it] = src[i*element_count+it*WARP_SIZE];
+            } else {
+                elements[i][it] = -std::numeric_limits<acc_t>::infinity();
+            }
+        }
+    }
+
+    // compute max_value
+    acc_t max_value[WARP_BATCH];
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+        bool is_meaningful_max = false;
+        max_value[i] = elements[i][0];
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            if (is_masked) {
+                int idx = it*WARP_SIZE;
+                if ((idx + local_idx) < batch_element_count) {
+                    if (!is_transformer_mask) {
+                        idx += i*element_count;
+                    }
+                    if (!mask[idx]) {
+                        max_value[i] = (is_meaningful_max && max_value[i] > elements[i][it]) ? max_value[i] : elements[i][it];
+                        is_meaningful_max = true;
+                    }
+                }
+            } else {
+                max_value[i] = max_value[i] > elements[i][it] ? max_value[i] : elements[i][it];
+            }
+        }
+        if (is_masked) {
+            if (!is_meaningful_max) {
+                max_value[i] = -std::numeric_limits<acc_t>::infinity();
+            }
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Max>(max_value);
+
+    acc_t sum[WARP_BATCH] { 0.0f };
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            if (!is_masked) {
+                if (is_log_softmax) {
+                    sum[i] += std::exp(elements[i][it] - max_value[i]);
+                } else {
+                    elements[i][it] = std::exp(elements[i][it] - max_value[i]);
+                    sum[i] += elements[i][it];
+                }
+            } else {
+                int idx = it*WARP_SIZE;
+                bool valid = (idx + local_idx) < batch_element_count;
+                if (!is_transformer_mask) {
+                    idx += i*element_count;
+                }
+                if (valid) {
+                    if (!mask[idx]) {
+                        if (is_log_softmax) {
+                            sum[i] += std::exp(elements[i][it] - max_value[i]);
+                        } else {
+                            elements[i][it] = std::exp(elements[i][it] - max_value[i]);
+                            sum[i] += elements[i][it];
+                        }
+                    } else {
+                        if (!is_log_softmax) {
+                            // Masked values are treated as -infinity, and std::exp(-infinity) is 0.
+                            elements[i][it] = 0;
+                        }
+                    }
+                } else {
+                    if (!is_log_softmax) {
+                        elements[i][it] = 0.;
+                    }
+                }
+            }
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);
+
+    // store result
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        if (i >= local_batches)
+            break;
+        if (is_log_softmax) sum[i] = std::log(sum[i]);
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < element_count) {
+                if (is_log_softmax) {
+                    dst[i*element_count+it*WARP_SIZE] = elements[i][it] - max_value[i] - sum[i];
+                } else if (sum[i] == 0) {
+                    dst[i*element_count+it*WARP_SIZE] = std::numeric_limits<acc_t>::quiet_NaN();
+                } else {
+                    dst[i*element_count+it*WARP_SIZE] = elements[i][it] / sum[i];
+                }
+            } else {
+                break;
+            }
+        }
+    }
+}
+
+template <typename input_t, typename output_t, typename acc_t, int log2_elements, bool is_log_softmax, bool is_masked>
+__global__ void softmax_warp_backward(output_t *gradInput, const input_t *grad, const input_t *output, int batch_size, int stride, int element_count, const bool *mask = nullptr)
+{
+    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel.
+    constexpr int next_power_of_two = 1 << log2_elements;
+    constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE;
+    constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
+    constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
+
+    int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH;
+
+    // batch_size might not be a multiple of WARP_BATCH. Check how
+    // many batches have to computed within this WARP.
+    int local_batches = batch_size - first_batch;
+    if (local_batches > WARP_BATCH)
+        local_batches = WARP_BATCH;
+
+    // there might be multiple batches per warp. compute the index within the batch
+    int local_idx = threadIdx.x % WARP_SIZE;
+
+    // the first element to process by the current thread
+    int thread_offset = first_batch * stride + local_idx;
+    grad += thread_offset;
+    output += thread_offset;
+    gradInput += thread_offset;
+    if (is_masked) {
+        mask += thread_offset;
+    }
+
+    // The nested loops over WARP_BATCH and then WARP_ITERATIONS can be simplified to one loop,
+    // but I think doing so would obfuscate the logic of the algorithm, thus I chose to keep
+    // the nested loops.
+    // This should have no impact on performance because the loops are unrolled anyway.
+
+    // load data from global memory
+    acc_t grad_reg[WARP_BATCH][WARP_ITERATIONS];
+    acc_t output_reg[WARP_BATCH][WARP_ITERATIONS];
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        int batch_element_count = (i >= local_batches) ? 0 : element_count;
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < batch_element_count) {
+                grad_reg[i][it] = grad[i*element_count+it*WARP_SIZE];
+                output_reg[i][it] = output[i*element_count+it*WARP_SIZE];
+            } else {
+                grad_reg[i][it] = acc_t(0);
+                output_reg[i][it] = acc_t(0);
+            }
+        }
+    }
+
+    acc_t sum[WARP_BATCH] { 0.0f };
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            if (!is_masked || !mask[i*element_count+it*WARP_SIZE]) {
+                sum[i] += grad_reg[i][it];
+            }
+        }
+    }
+    warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);
+
+    // store result
+    #pragma unroll
+    for (int i = 0;  i < WARP_BATCH;  ++i) {
+        if (i >= local_batches)
+            break;
+        #pragma unroll
+        for (int it = 0;  it < WARP_ITERATIONS;  ++it) {
+            int element_index = local_idx + it * WARP_SIZE;
+            if (element_index < element_count) {
+                if (is_masked && mask[i*element_count+it*WARP_SIZE]) {
+                    gradInput[i*element_count+it*WARP_SIZE] = 0;
+                }
+                // compute gradients
+                else if (is_log_softmax) {
+                    gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - std::exp(output_reg[i][it]) * sum[i]);
+                } else {
+                    gradInput[i*element_count+it*WARP_SIZE] = (grad_reg[i][it] - output_reg[i][it] * sum[i]);
+                }
+            }
+        }
+    }
+}
+
+} // end of anonymous namespace
+
+template<typename input_t, typename output_t, typename acc_t, bool is_log_softmax, bool is_masked>
+void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr, int chunk_size = -1, bool is_transformer_mask = false)
+{
+    TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 );
+    if (softmax_elements == 0) {
+        return;
+    } else {
+        int log2_elements = log2_ceil(softmax_elements);
+        const int next_power_of_two = 1 << log2_elements;
+
+        // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
+        int warp_size = at::cuda::warp_size();
+        warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size;
+
+        // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
+        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+        // use 128 threads per block to maximize gpu utilization
+        constexpr int threads_per_block = 128;
+
+        int warps_per_block = (threads_per_block / warp_size);
+        int batches_per_block = warps_per_block * batches_per_warp;
+        int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
+        dim3 threads(warp_size, warps_per_block, 1);
+        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
+        switch (log2_elements) {
+            #define LAUNCH_SOFTMAX_WARP_FORWARD(L2E) case L2E:                    \
+            softmax_warp_forward<input_t, output_t, acc_t, L2E, is_log_softmax, is_masked>   \
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst,   \
+                    src, batch_count, softmax_elements_stride, softmax_elements, mask, chunk_size, is_transformer_mask); \
+            C10_CUDA_KERNEL_LAUNCH_CHECK();                                       \
+            break;
+
+            LAUNCH_SOFTMAX_WARP_FORWARD(0);  // 1
+            LAUNCH_SOFTMAX_WARP_FORWARD(1);  // 2
+            LAUNCH_SOFTMAX_WARP_FORWARD(2);  // 4
+            LAUNCH_SOFTMAX_WARP_FORWARD(3);  // 8
+            LAUNCH_SOFTMAX_WARP_FORWARD(4);  // 16
+            LAUNCH_SOFTMAX_WARP_FORWARD(5);  // 32
+            LAUNCH_SOFTMAX_WARP_FORWARD(6);  // 64
+            LAUNCH_SOFTMAX_WARP_FORWARD(7);  // 128
+            LAUNCH_SOFTMAX_WARP_FORWARD(8);  // 256
+            LAUNCH_SOFTMAX_WARP_FORWARD(9);  // 512
+            LAUNCH_SOFTMAX_WARP_FORWARD(10); ; // 1024
+            default:
+                break;
+        }
+    }
+}
+
+template<typename input_t, typename output_t, typename acc_t, bool is_log_softmax, bool is_masked>
+void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const input_t *output, int softmax_elements, int softmax_elements_stride, int batch_count, const bool *mask = nullptr)
+{
+    TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 1024 );
+    if (softmax_elements == 0) {
+       return;
+    } else {
+        int log2_elements = log2_ceil(softmax_elements);
+        const int next_power_of_two = 1 << log2_elements;
+
+        // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
+        int warp_size = at::cuda::warp_size();
+        warp_size = (next_power_of_two < warp_size) ? next_power_of_two : warp_size;
+
+        // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
+        int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
+
+        // use 128 threads per block to maximize gpu utilization
+        constexpr int threads_per_block = 128;
+
+        int warps_per_block = (threads_per_block / warp_size);
+        int batches_per_block = warps_per_block * batches_per_warp;
+        int blocks = (batch_count + batches_per_block - 1) / batches_per_block;
+        dim3 threads(warp_size, warps_per_block, 1);
+        // Launch code would be more elegant if C++ supported FOR CONSTEXPR
+        switch (log2_elements) {
+            #define LAUNCH_SOFTMAX_WARP_BACKWARD(L2E) case L2E:                      \
+            softmax_warp_backward<input_t, output_t, acc_t, L2E, is_log_softmax, is_masked> \
+                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>       \
+                (grad_input, grad, output, batch_count, softmax_elements_stride, \
+                softmax_elements, mask);                                              \
+            C10_CUDA_KERNEL_LAUNCH_CHECK();                                      \
+            break;
+
+            LAUNCH_SOFTMAX_WARP_BACKWARD(0); // 1
+            LAUNCH_SOFTMAX_WARP_BACKWARD(1); // 2
+            LAUNCH_SOFTMAX_WARP_BACKWARD(2); // 4
+            LAUNCH_SOFTMAX_WARP_BACKWARD(3); // 8
+            LAUNCH_SOFTMAX_WARP_BACKWARD(4); // 16
+            LAUNCH_SOFTMAX_WARP_BACKWARD(5); // 32
+            LAUNCH_SOFTMAX_WARP_BACKWARD(6); // 64
+            LAUNCH_SOFTMAX_WARP_BACKWARD(7); // 128
+            LAUNCH_SOFTMAX_WARP_BACKWARD(8); // 256
+            LAUNCH_SOFTMAX_WARP_BACKWARD(9); // 512
+            LAUNCH_SOFTMAX_WARP_BACKWARD(10); // 1024
+            default:
+                break;
+        }
+    }
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Reduce.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Reduce.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..79b23ea8afd4fc4e4d505141b43bbce06275a881
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Reduce.cuh
@@ -0,0 +1,1355 @@
+#pragma once
+
+#include <ATen/core/Array.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/DeviceUtils.cuh>
+#include <ATen/cuda/detail/OffsetCalculator.cuh>
+#include <ATen/detail/FunctionTraits.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/cuda/thread_constants.h>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+#include <ATen/OpMathType.h>
+#include <c10/macros/Macros.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <functional>
+#include <iosfwd>
+#include <type_traits>
+#include <utility>
+#include <thrust/pair.h>
+
+#include <ATen/native/cuda/jit_utils.h>
+
+namespace at { namespace native {
+
+using at::detail::Array;
+
+static inline int64_t div_up(int64_t a, int64_t b) {
+  return (a + b - 1) / b;
+}
+
+// returns floor(log2(n))
+static inline int last_pow2(int n) {
+  n |= (n >>  1);
+  n |= (n >>  2);
+  n |= (n >>  4);
+  n |= (n >>  8);
+  n |= (n >> 16);
+  return std::max(1, n - (n >> 1));
+}
+
+// returns reduced fraction numerator & denominator
+C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) {
+  // get GCD of num and denom using Euclid's algorithm.
+  // Can replace this with std::gcd if we ever support c++17.
+  size_t a = denominator;
+  size_t b = numerator;
+  while (b != 0) {
+      a %= b;
+      // swap(a,b)
+      size_t tmp = a;
+      a = b;
+      b = tmp;
+  }
+
+  // a is now the GCD
+  numerator /= a;
+  denominator /= a;
+}
+
+//template for changing MAX_NUM_THREADS based on op dtype
+template <typename T>
+struct mnt_wrapper {
+  static constexpr int MAX_NUM_THREADS = 512;
+};
+
+template <>
+struct mnt_wrapper <c10::complex<double>>{
+  static constexpr int MAX_NUM_THREADS = 256;
+};
+
+constexpr int max_reduce_threads(c10::ScalarType type) {
+  return type == kComplexDouble ? 256 : 512;
+}
+
+struct ReduceConfig {
+  static constexpr int BLOCK_X = 0;
+  static constexpr int BLOCK_Y = 1;
+  static constexpr int CTA = 2;
+
+  static constexpr int input_vec_size = 4;
+
+  ReduceConfig(int element_size_bytes, int num_outputs, int num_inputs)
+    : element_size_bytes(element_size_bytes)
+    , num_inputs(num_inputs)
+    , num_outputs(num_outputs) {}
+  int element_size_bytes;
+  int num_inputs;
+  int num_outputs;
+  int step_input = 1;
+  int step_output = 1;
+  int ctas_per_output = 1;
+  int input_mult[3] = {0, 0, 0};
+  int output_mult[2] = {0, 0};
+
+  int block_width;
+  int block_height;
+  int num_threads;
+
+  bool vectorize_input = false;
+  int output_vec_size = 1;
+
+  template <typename T>
+  void set_block_dimension(int64_t dim0, int64_t dim1) {
+    const int max_num_threads = mnt_wrapper<T>::MAX_NUM_THREADS / output_vec_size;
+    int dim0_pow2 = dim0 < max_num_threads ? static_cast<int>(last_pow2(dim0)) : max_num_threads;
+    int dim1_pow2 = dim1 < max_num_threads ? static_cast<int>(last_pow2(dim1)) : max_num_threads;
+    block_width = std::min(dim0_pow2, int(at::cuda::warp_size()));
+    block_height = std::min(dim1_pow2, int(max_num_threads / block_width));
+    block_width = std::min(dim0_pow2, int(max_num_threads / block_height));
+    num_threads = block_width * block_height;
+  }
+
+  int split_input(int parallelism) {
+    int step = step_input;
+    step_input *= parallelism;
+    return step;
+  }
+
+  int split_output(int parallelism) {
+    int step = step_output;
+    step_output *= parallelism;
+    return step;
+  }
+
+  dim3 block() const {
+    return dim3(block_width, block_height);
+  }
+
+  dim3 grid() const {
+    return dim3(div_up(num_outputs / output_vec_size, step_output), ctas_per_output);
+  }
+
+  C10_HOST_DEVICE bool should_block_x_reduce() const {
+    return input_mult[BLOCK_X] != 0;
+  }
+
+  C10_HOST_DEVICE bool should_block_y_reduce() const {
+    return input_mult[BLOCK_Y] != 0;
+  }
+
+  C10_HOST_DEVICE bool should_global_reduce() const {
+    return input_mult[CTA] != 0;
+  }
+
+  C10_DEVICE bool should_store(int output_idx) const {
+    return output_idx < num_outputs &&
+      (!should_block_x_reduce() || threadIdx.x == 0) &&
+      (!should_block_y_reduce() || threadIdx.y == 0);
+  }
+
+  C10_DEVICE bool should_reduce_tail() const {
+    return (!should_block_y_reduce() || threadIdx.y == 0) &&
+      (!should_global_reduce() || blockIdx.y == 0);
+  }
+
+  C10_HOST_DEVICE int input_idx() const {
+    int lane = threadIdx.x;
+    int warp = threadIdx.y;
+    int cta2 = blockIdx.y;
+    return (lane * input_mult[BLOCK_X] +
+            warp * input_mult[BLOCK_Y] +
+            cta2 * input_mult[CTA]);
+  }
+
+  template <int output_vec_size>
+  C10_HOST_DEVICE int output_idx() const {
+    int lane = threadIdx.x;
+    int warp = threadIdx.y;
+    int cta1 = blockIdx.x;
+    return (lane * output_mult[BLOCK_X] +
+            warp * output_mult[BLOCK_Y] +
+            cta1 * step_output) * output_vec_size;
+  }
+
+  C10_DEVICE int shared_memory_offset(int offset) const {
+    return threadIdx.x + (threadIdx.y + offset) * blockDim.x;
+  }
+
+  C10_DEVICE int staging_memory_offset(int cta2) const {
+    int offset = cta2 + blockIdx.x * gridDim.y;
+    if (!should_block_x_reduce()) {
+      offset = threadIdx.x + offset * blockDim.x;
+    }
+    return offset;
+  }
+
+  int shared_memory_size() const {
+    if (!should_block_y_reduce() &&
+        (!should_block_x_reduce() ||
+         block_width <= at::cuda::warp_size())) {
+      return 0;
+    }
+    return element_size_bytes * num_threads * output_vec_size;
+  }
+
+  int64_t global_memory_size() const {
+    if (!should_global_reduce()) {
+      return 0;
+    }
+    auto size = (int64_t)element_size_bytes * num_outputs * ctas_per_output;
+    if (!should_block_x_reduce()) {
+      size *= block().x * output_vec_size;
+    }
+    return size;
+  }
+
+  int semaphore_size() const {
+    if (!should_global_reduce()) {
+      return 0;
+    }
+    return sizeof(int) * grid().x;
+  }
+
+  int values_per_thread() const {
+    return div_up(num_inputs, step_input);
+  }
+};
+
+std::ostream& operator<<(std::ostream& out, const ReduceConfig& config);
+
+template<int nt, int output_vec_size, typename R>
+C10_LAUNCH_BOUNDS_2(nt, 4)
+__global__ void reduce_kernel(R reduction) {
+  reduction.template run<output_vec_size>();
+}
+
+template <typename index_t>
+static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& iter) {
+  int num_reduce_dims = iter.num_reduce_dims();
+  int num_output_dims = iter.ndim() - num_reduce_dims;
+  int input_index = iter.ntensors() - 1;
+  int output_index = 0;
+  std::array<const int64_t*, 2> strides = {
+    iter.strides(output_index).data() + num_reduce_dims,
+    iter.strides(input_index).data() + num_reduce_dims,
+  };
+  auto shape = iter.shape().data() + num_reduce_dims;
+  return OffsetCalculator<2, index_t>(num_output_dims, shape, strides.data());
+}
+
+template <typename index_t>
+static OffsetCalculator<1, index_t> make_input_calculator(const TensorIterator& iter) {
+  int num_reduce_dims = iter.num_reduce_dims();
+  int input_index = iter.ntensors() - 1;
+  std::array<const int64_t*, 1> strides = {
+    iter.strides(input_index).data(),
+  };
+  return OffsetCalculator<1, index_t>(num_reduce_dims, iter.shape().data(), strides.data());
+}
+
+template <typename out_scalar_t, typename func_t>
+struct func_wrapper_t {
+  using arg_t = typename binary_function_traits<func_t>::arg1_t;
+  using scalar_t = typename binary_function_traits<func_t>::arg2_t;
+
+  func_t combine;
+  static inline __device__ out_scalar_t project(arg_t arg) {
+    return (out_scalar_t) arg;
+  }
+  static inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) {
+    return WARP_SHFL_DOWN(arg, offset);
+  }
+
+  static __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) {
+    return acc;
+  }
+
+  func_wrapper_t(const func_t& op) : combine(op) {
+  }
+
+  // wrap a normal reduction that ignores the index
+  __device__ arg_t reduce(arg_t acc, scalar_t val, int64_t idx) const {
+    return combine(acc, val);
+  }
+};
+
+template <typename scalar_t, typename func_t>
+func_wrapper_t<scalar_t, func_t> func_wrapper(const func_t& op) {
+  return func_wrapper_t<scalar_t, func_t> { op };
+}
+
+template <typename scalar_t, typename out_scalar_t=scalar_t>
+struct ReduceJitOp {
+//ReduceJitOp is almost like ReduceOp, but it doesn't have ops functor that specifies reduction operations
+//Maybe we can find a way to unify ReduceOp and ReduceJitOp
+  using InputCalculator = OffsetCalculator<1, uint32_t>;
+  using OutputCalculator = OffsetCalculator<2, uint32_t>;
+  //TODO for now arg_t is always opmath_t of the input, later we'll need to change it
+  using arg_t = at::opmath_type<scalar_t>;
+
+  static constexpr int input_vec_size = ReduceConfig::input_vec_size;
+  //TODO - ReduceJitOp will probably need to be changed for reductions that need full functor,
+  //not just wrapper
+  arg_t ident;
+  ReduceConfig config;
+  InputCalculator input_calc;
+  OutputCalculator output_calc;
+  const void* src;
+  const char* dst[2]; //it accepts at most two destinations
+  // acc_buf used for accumulation among sub Tensor Iterator when accumulation on
+  // output is not permissible
+  void* acc_buf;
+  // cta_buf used for accumulation between blocks during global reduction
+  void* cta_buf;
+  int* semaphores;
+  int64_t base_idx;
+  bool accumulate;
+  bool final_output;
+  int noutputs;
+
+  ReduceJitOp(
+      ReduceConfig config,
+      InputCalculator input_calc,
+      OutputCalculator output_calc,
+      const void* src,
+      char* dst0,
+      std::optional<char*> dst1,
+      void* acc_buf,
+      void* cta_buf,
+      int* semaphores,
+      arg_t ident,
+      int noutputs,
+      int64_t base_idx)
+      : ident(ident),
+        config(config),
+        input_calc(input_calc),
+        output_calc(output_calc),
+        src(src),
+        acc_buf(acc_buf),
+        cta_buf(cta_buf),
+        semaphores(semaphores),
+        base_idx(base_idx),
+        noutputs(noutputs) {
+    dst[0] = dst0;
+    if (dst1.has_value()) {
+      dst[1] = dst1.value();
+    }
+  }
+};
+
+template <typename scalar_t, typename ops_t, typename index_t, typename out_scalar_t=scalar_t, int vt0=4>
+struct ReduceOp {
+  using traits = function_traits<decltype(&ops_t::reduce)>;
+  using arg_t = typename std::decay<typename traits::template arg<0>::type>::type;
+
+  using InputCalculator = OffsetCalculator<1, index_t>;
+  using OutputCalculator = OffsetCalculator<2, index_t>;
+
+  static constexpr bool can_accumulate_in_output =
+    std::is_convertible<arg_t, out_scalar_t>::value
+    && std::is_convertible<out_scalar_t, arg_t>::value;
+
+  static constexpr int input_vec_size = ReduceConfig::input_vec_size;
+
+  ops_t ops;
+  arg_t ident;
+  ReduceConfig config;
+  InputCalculator input_calc;
+  OutputCalculator output_calc;
+  const void* src;
+  const char* dst[2]; //it accepts at most two destinations
+  // acc_buf used for accumulation among sub Tensor Iterator when accumulation on
+  // output is not permissible
+  void* acc_buf;
+  // cta_buf used for accumulation between blocks during global reduction
+  void* cta_buf;
+  int* semaphores;
+  int64_t base_idx;
+  bool accumulate;
+  bool final_output;
+  int noutputs;
+
+  ReduceOp(
+      ops_t ops,
+      ReduceConfig config,
+      InputCalculator input_calc,
+      OutputCalculator output_calc,
+      const void* src,
+      char* dst0,
+      std::optional<char*> dst1,
+      void* acc_buf,
+      void* cta_buf,
+      int* semaphores,
+      arg_t ident,
+      int noutputs,
+      int64_t base_idx)
+      : ops(ops),
+        ident(ident),
+        config(config),
+        input_calc(input_calc),
+        output_calc(output_calc),
+        src(src),
+        acc_buf(acc_buf),
+        cta_buf(cta_buf),
+        semaphores(semaphores),
+        base_idx(base_idx),
+        noutputs(noutputs) {
+    dst[0] = dst0;
+    if (dst1.has_value()) {
+      dst[1] = dst1.value();
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE void run() const {
+    extern __shared__ char shared_memory[];
+    index_t output_idx = config.output_idx<output_vec_size>();
+    index_t input_idx = config.input_idx();
+    auto base_offsets1 = output_calc.get(output_idx)[1];
+
+    using arg_vec_t = at::detail::Array<arg_t, output_vec_size>;
+    arg_vec_t value;
+
+    if (output_idx < config.num_outputs && input_idx < config.num_inputs) {
+      const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1);
+      value = thread_reduce<output_vec_size>(input_slice);
+    }
+
+    if (config.should_block_y_reduce()) {
+      value = block_y_reduce<output_vec_size>(value, shared_memory);
+    }
+    if (config.should_block_x_reduce()) {
+      value = block_x_reduce<output_vec_size>(value, shared_memory);
+    }
+
+    using out_ptr_vec_t = at::detail::Array<out_scalar_t*, output_vec_size>;
+    using offset_vec_t = at::detail::Array<index_t, output_vec_size>;
+    offset_vec_t base_offsets;
+    out_ptr_vec_t out;
+
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      base_offsets[i] = output_calc.get(output_idx + i)[0];
+      out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
+    }
+
+    arg_vec_t* acc = nullptr;
+    if (acc_buf != nullptr) {
+      size_t numerator = sizeof(arg_t);
+      size_t denominator = sizeof(out_scalar_t);
+      reduce_fraction(numerator, denominator);
+      acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator));
+    }
+
+    if (config.should_global_reduce()) {
+      value = global_reduce<output_vec_size>(value, acc, shared_memory);
+    } else if (config.should_store(output_idx)) {
+      if (accumulate) {
+        #pragma unroll
+        for (int i = 0; i < output_vec_size; i++) {
+          value[i] = ops.translate_idx(value[i], base_idx);
+        }
+      }
+
+      if (acc == nullptr) {
+        if (accumulate) {
+          value = accumulate_in_output<output_vec_size, can_accumulate_in_output>(out, value);
+        }
+        if (final_output) {
+          set_results_to_output<output_vec_size>(value, base_offsets);
+        } else {
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            *(out[i]) = get_accumulated_output<can_accumulate_in_output>(out[i], value[i]);
+          }
+        }
+      } else {
+        if (accumulate) {
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.combine((*acc)[i], value[i]);
+          }
+        }
+        if (final_output) {
+          set_results_to_output<output_vec_size>(value, base_offsets);
+        } else {
+          *acc = value;
+        }
+      }
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> thread_reduce(const scalar_t* data) const {
+    if (config.vectorize_input) {
+      CUDA_KERNEL_ASSERT(output_vec_size == 1);
+      // reduce at the header of input_slice where memory is not aligned,
+      // so that thread_reduce will have an aligned memory to work on.
+      return {input_vectorized_thread_reduce_impl(data)};
+    } else {
+      index_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t);
+      bool is_contiguous = (input_calc.dims == 1 && element_stride == 1);
+      if (is_contiguous) {
+        return thread_reduce_impl<output_vec_size>(data, [](index_t idx) { return idx; });
+      } else if (input_calc.dims == 1) {
+        return thread_reduce_impl<output_vec_size>(data, [&](index_t idx) { return idx * element_stride; });
+      } else {
+        return thread_reduce_impl<output_vec_size>(data, [&](index_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); });
+      }
+    }
+  }
+
+  C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const {
+    index_t end = config.num_inputs;
+
+    // Handle the head of input slice where data is not aligned
+    arg_t value = ident;
+    constexpr int align_bytes = alignof(at::native::memory::aligned_vector<scalar_t, input_vec_size>);
+    constexpr int align_elements = align_bytes / sizeof(scalar_t);
+    int shift = ((uint64_t)data) % align_bytes / sizeof(scalar_t);
+    if (shift > 0) {
+      data -= shift;
+      end += shift;
+      if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){
+        value = ops.reduce(value, c10::load(data + threadIdx.x), threadIdx.x - shift);
+      }
+      end -= align_elements;
+      data += align_elements;
+      shift = align_elements - shift;
+    }
+
+    // Do the vectorized reduction
+    using load_t = at::native::memory::aligned_vector<scalar_t, input_vec_size>;
+
+    index_t idx = config.input_idx();
+    const index_t stride = config.step_input;
+
+    // Multiple accumulators to remove dependency between unrolled loops.
+    arg_t value_list[input_vec_size];
+    value_list[0] = value;
+
+    #pragma unroll
+    for (int i = 1; i < input_vec_size; i++) {
+      value_list[i] = ident;
+    }
+
+    while (idx * input_vec_size + input_vec_size - 1 < end) {
+      const auto values_vec = memory::load_vector<input_vec_size>(data, idx);
+      #pragma unroll
+      for (index_t i = 0; i < input_vec_size; i++) {
+        value_list[i] = ops.reduce(value_list[i], values_vec.val[i], shift + idx * input_vec_size + i);
+      }
+      idx += stride;
+    }
+
+    // tail
+    index_t tail_start = end - end % input_vec_size;
+    if (config.should_reduce_tail()) {
+      int idx = tail_start + threadIdx.x;
+      if (idx < end) {
+        const auto value = c10::load(data + idx);
+        value_list[0] = ops.reduce(value_list[0], value, idx + shift);
+      }
+    }
+
+    // combine accumulators
+    #pragma unroll
+    for (int i = 1; i < input_vec_size; i++) {
+      value_list[0] = ops.combine(value_list[0], value_list[i]);
+    }
+    return value_list[0];
+  }
+
+  template <int output_vec_size, typename offset_calc_t>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const {
+    index_t idx = config.input_idx();
+    const index_t end = config.num_inputs;
+    const index_t stride = config.step_input;
+
+    using arg_vec_t = at::detail::Array<arg_t, output_vec_size>;
+    using load_t = at::native::memory::aligned_vector<scalar_t, output_vec_size>;
+
+    // Multiple accumulators to remove dependency between unrolled loops.
+    arg_vec_t value_list[vt0];
+
+    #pragma unroll
+    for (int i = 0; i < vt0; i++) {
+      #pragma unroll
+      for (int j = 0; j < output_vec_size; j++) {
+        value_list[i][j] = ident;
+      }
+    }
+
+    load_t values[vt0];
+
+    while (idx + (vt0 - 1) * stride < end) {
+      #pragma unroll
+      for (index_t i = 0; i < vt0; i++) {
+        const auto offset = calc(idx + i * stride) / output_vec_size;
+        values[i] = memory::load_vector<output_vec_size>(data_, offset);
+      }
+      #pragma unroll
+      for (index_t i = 0; i < vt0; i++) {
+        #pragma unroll
+        for (index_t j = 0; j < output_vec_size; j++) {
+          value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx + i * stride);
+        }
+      }
+      idx += stride * vt0;
+    }
+
+    // tail
+    int idx_ = idx;
+    #pragma unroll
+    for (index_t i = 0; i < vt0; i++) {
+      if (idx >= end) {
+        break;
+      }
+      const auto offset = calc(idx) / output_vec_size;
+      values[i] = memory::load_vector<output_vec_size>(data_, offset);
+      idx += stride;
+    }
+    idx = idx_;
+    #pragma unroll
+    for (index_t i = 0; i < vt0; i++) {
+      if (idx >= end) {
+        break;
+      }
+      #pragma unroll
+      for (index_t j = 0; j < output_vec_size; j++) {
+        value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx);
+      }
+      idx += stride;
+    }
+
+    // combine accumulators
+    #pragma unroll
+    for (int i = 1; i < vt0; i++) {
+      #pragma unroll
+      for (index_t j = 0; j < output_vec_size; j++) {
+        value_list[0][j] = ops.combine(value_list[0][j], value_list[i][j]);
+      }
+    }
+    return value_list[0];
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> block_x_reduce(at::detail::Array<arg_t, output_vec_size> value, char* shared_memory) const {
+    using args_vec_t = at::detail::Array<arg_t, output_vec_size>;
+    int dim_x = blockDim.x;
+    args_vec_t* shared = (args_vec_t*)shared_memory;
+    if (dim_x > warpSize) {
+      int address_base = threadIdx.x + threadIdx.y*blockDim.x;
+      shared[address_base] = value;
+      for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) {
+        __syncthreads();
+        if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) {
+          args_vec_t other = shared[address_base + offset];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.combine(value[i], other[i]);
+          }
+          shared[address_base] = value;
+        }
+      }
+      dim_x = warpSize;
+    }
+
+    __syncthreads();
+
+    for (int offset = 1; offset < dim_x; offset <<= 1) {
+      #pragma unroll
+      for (int i = 0; i < output_vec_size; i++) {
+        arg_t other = ops.warp_shfl_down(value[i], offset);
+        value[i] = ops.combine(value[i], other);
+      }
+    }
+    return value;
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> block_y_reduce(at::detail::Array<arg_t, output_vec_size> value, char* shared_memory) const {
+    using args_vec_t = at::detail::Array<arg_t, output_vec_size>;
+    args_vec_t* shared = (args_vec_t*)shared_memory;
+    shared[config.shared_memory_offset(0)] = value;
+    for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
+      __syncthreads();
+      if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
+        args_vec_t other = shared[config.shared_memory_offset(offset)];
+        #pragma unroll
+        for (int i = 0; i < output_vec_size; i++) {
+          value[i] = ops.combine(value[i], other[i]);
+        }
+        shared[config.shared_memory_offset(0)] = value;
+      }
+    }
+    return value;
+  }
+
+  C10_DEVICE bool mark_block_finished() const {
+    __shared__ bool is_last_block_done_shared;
+
+    __syncthreads();
+    if (threadIdx.x == 0 && threadIdx.y == 0) {
+      int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1);
+      is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1);
+    }
+
+    __syncthreads();
+
+    return is_last_block_done_shared;
+  }
+
+  template <int output_vec_size, bool can_acc>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> accumulate_in_output(
+    at::detail::Array<out_scalar_t*, output_vec_size> out,
+    at::detail::Array<arg_t, output_vec_size> value,
+    typename std::enable_if<can_acc>::type* = nullptr
+  ) const {
+    at::detail::Array<arg_t, output_vec_size> ret;
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      ret[i] = ops.combine(*(out[i]), value[i]);
+    }
+    return ret;
+  }
+
+  template <bool can_acc>
+  C10_DEVICE out_scalar_t get_accumulated_output(
+    out_scalar_t* out, arg_t value,
+    typename std::enable_if<can_acc>::type* = nullptr
+  ) const {
+    CUDA_KERNEL_ASSERT(!final_output);
+    return (out_scalar_t)value;
+  }
+
+  // This function should never be called --
+  // it's the version of `accumulate_in_output`
+  // when accumulation in the output is not possible.
+  template <int output_vec_size, bool can_acc>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> accumulate_in_output(
+    at::detail::Array<out_scalar_t*, output_vec_size>,
+    at::detail::Array<arg_t, output_vec_size>,
+    typename std::enable_if<!can_acc>::type* = nullptr
+  ) const {
+    CUDA_KERNEL_ASSERT(false);
+    return arg_t {};
+  }
+
+  // This function should never be called --
+  // it's the version of `get_accumulated_output`
+  // when accumulation in the output is not possible.
+  template <bool can_acc>
+  C10_DEVICE out_scalar_t get_accumulated_output(
+    out_scalar_t* out, arg_t value,
+    typename std::enable_if<!can_acc>::type* = nullptr
+  ) const {
+    CUDA_KERNEL_ASSERT(false);
+    return *out;
+  }
+
+  template<class T>
+  C10_DEVICE void set_results(const T x, const index_t base_offset) const {
+    CUDA_KERNEL_ASSERT(noutputs == 1);
+    auto res = (out_scalar_t*)((char*)dst[0] + base_offset);
+    *res = x;
+  }
+
+  //Currently implemented for max of two outputs
+  template<class T1, class T2>
+  C10_DEVICE void set_results(const thrust::pair<T1, T2> x, const index_t base_offset) const {
+    if (noutputs >= 1) {
+      auto res0 = (T1*)((char*)dst[0] + base_offset);
+      *res0 = x.first;
+    }
+    if (noutputs >= 2) {
+      // base offset is computed assuming element size being sizeof(T1), so we need to make a
+      // correction to obtain the correct base offset
+      auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2));
+      *res1 = x.second;
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE void set_results_to_output(at::detail::Array<arg_t, output_vec_size> value, at::detail::Array<index_t, output_vec_size> base_offset) const {
+    CUDA_KERNEL_ASSERT(final_output);
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      set_results(ops.project(value[i]), base_offset[i]);
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE at::detail::Array<arg_t, output_vec_size> global_reduce(at::detail::Array<arg_t, output_vec_size> value, at::detail::Array<arg_t, output_vec_size> *acc, char* shared_memory) const {
+    using arg_vec_t = at::detail::Array<arg_t, output_vec_size>;
+    using out_ptr_vec_t = at::detail::Array<out_scalar_t*, output_vec_size>;
+    using offset_vec_t = at::detail::Array<index_t, output_vec_size>;
+
+    arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf;
+    index_t output_idx = config.output_idx<output_vec_size>();
+    offset_vec_t base_offsets;
+    out_ptr_vec_t out;
+
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      base_offsets[i] = output_calc.get(output_idx + i)[0];
+      out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
+    }
+
+    bool should_store = config.should_store(output_idx);
+    if (should_store) {
+      index_t offset = config.staging_memory_offset(blockIdx.y);
+      reduce_buffer[offset] = value;
+    }
+
+    __threadfence(); // make sure writes are globally visible
+    __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
+    bool is_last_block_done = mark_block_finished();
+
+    if (is_last_block_done) {
+      __threadfence(); // complete the acquire pattern after atomic
+      value = ident;
+      if (config.should_block_x_reduce()) {
+        index_t input_offset = threadIdx.x + threadIdx.y * blockDim.x;
+        index_t step = blockDim.x * blockDim.y;
+        for (; input_offset < config.ctas_per_output; input_offset += step) {
+          index_t idx = config.staging_memory_offset(input_offset);
+          arg_vec_t next = reduce_buffer[idx];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.combine(value[i], next[i]);
+          }
+        }
+      } else {
+        index_t input_offset = threadIdx.y;
+        index_t step = blockDim.y;
+        for (; input_offset < config.ctas_per_output; input_offset += step) {
+          index_t idx = config.staging_memory_offset(input_offset);
+          arg_vec_t next = reduce_buffer[idx];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.combine(value[i], next[i]);
+          }
+        }
+      }
+      value = block_y_reduce(value, shared_memory);
+      if (config.should_block_x_reduce()) {
+        value = block_x_reduce<output_vec_size>(value, shared_memory);
+      }
+      if (should_store) {
+        if (accumulate) {
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = ops.translate_idx(value[i], base_idx);
+          }
+        }
+
+        if (acc == nullptr) {
+          if (accumulate) {
+            value = accumulate_in_output<output_vec_size, can_accumulate_in_output>(out, value);
+          }
+          if (final_output) {
+            set_results_to_output<output_vec_size>(value, base_offsets);
+          } else {
+            #pragma unroll
+            for (int i = 0; i < output_vec_size; i++) {
+              *(out[i]) = get_accumulated_output<can_accumulate_in_output>(out[i], value[i]);
+            }
+          }
+        } else {
+          if (accumulate) {
+            #pragma unroll
+            for (int i = 0; i < output_vec_size; i++) {
+              value[i] = ops.combine((*acc)[i], value[i]);
+            }
+          }
+          if (final_output) {
+            set_results_to_output<output_vec_size>(value, base_offsets);
+          } else {
+            *acc = value;
+          }
+        }
+      }
+    }
+
+    return value;
+  }
+};
+
+template<int max_threads, typename R>
+static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) {
+  dim3 block = config.block();
+  dim3 grid = config.grid();
+
+  auto stream = at::cuda::getCurrentCUDAStream();
+  int shared_memory = config.shared_memory_size();
+
+  switch(config.output_vec_size) {
+  case 4:
+    reduce_kernel<max_threads / 4, 4, R><<<grid, block, shared_memory, stream>>>(reduction);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+    break;
+  case 2:
+    reduce_kernel<max_threads / 2, 2, R><<<grid, block, shared_memory, stream>>>(reduction);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+    break;
+  default:
+    reduce_kernel<max_threads / 1, 1, R><<<grid, block, shared_memory, stream>>>(reduction);
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+}
+
+inline void launch_jitted_reduce_kernel(
+    std::mutex &jiterator_mutex,
+    std::array<at::cuda::jit::NvrtcFunction, 3> &fn_cache,
+    const at::cuda::jit::KernelDescriptor &desc,
+    int vt0, const ReduceConfig& config, void *reduction) {
+  dim3 block = config.block();
+  dim3 grid = config.grid();
+
+  int shared_memory = config.shared_memory_size();
+  at::cuda::jit::NvrtcFunction* fn_ptr;
+  switch(config.output_vec_size) {
+  case 4:
+    fn_ptr = &fn_cache[0];
+    break;
+  case 2:
+    fn_ptr = &fn_cache[1];
+    break;
+  default:
+    fn_ptr = &fn_cache[2];
+  }
+  if (!fn_ptr->function) {
+    int max_threads_codegen =
+        max_reduce_threads(desc.f_inputs_type) / config.output_vec_size;
+    auto code = at::cuda::jit::generate_reduction_code(
+        desc, vt0, true, false, config.output_vec_size, max_threads_codegen);
+
+    *fn_ptr = at::cuda::jit::jit_pwise_function(code, "reduction_" + desc.name);
+  }
+  constexpr int kernel_args = 1;
+  void* args[kernel_args];
+  args[0] = reduction;
+  at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, block, shared_memory);
+}
+
+
+class AccumulationBuffer {
+ public:
+  AccumulationBuffer() {}
+
+  AccumulationBuffer(size_t acc_t_size, size_t out_t_size, char* out_ptr, int64_t size) {
+    out_ptr_ = (char*)out_ptr;
+    if (out_t_size >= acc_t_size) {
+      // reusing output buffer for accumulation.
+      acc_ptr_ = (char*)out_ptr;
+      numerator_ = 1;
+      denominator_ = 1;
+    } else {
+      auto& allocator = *c10::cuda::CUDACachingAllocator::get();
+      buffer_ = allocator.allocate(size);
+      acc_ptr_ = (char*)buffer_.get();
+      numerator_ = acc_t_size;
+      denominator_ = out_t_size;
+      reduce_fraction(numerator_, denominator_);
+    }
+  }
+
+  char* get_acc_slice(char* out_ptr) {
+    if (acc_ptr_ == nullptr) {
+      return nullptr;
+    }
+    return acc_ptr_ + ((out_ptr - out_ptr_) * numerator_ / denominator_);
+  }
+
+ private:
+  char* acc_ptr_ = nullptr;
+  char* out_ptr_ = nullptr;
+  size_t numerator_;
+  size_t denominator_;
+  at::DataPtr buffer_;
+};
+
+template <typename scalar_t>
+int get_output_vec_size(const TensorIterator &iter) {
+  int vec_size = 4;
+  auto update_vec_size = [&vec_size](uint64_t n) {
+    while(n % vec_size != 0) {
+      vec_size /= 2;
+    }
+  };
+
+  uint64_t base_address = reinterpret_cast<uint64_t>(iter.data_ptr(iter.noutputs())) / sizeof(scalar_t);
+  update_vec_size(base_address);
+
+  const int output_index = iter.num_reduce_dims();
+  update_vec_size(iter.shape()[output_index]);
+
+  int j = 0;
+  for(auto i : iter.strides(iter.noutputs())) {
+    if (j != output_index) {
+      update_vec_size(i / sizeof(scalar_t));
+    }
+    j++;
+  }
+  return vec_size;
+}
+
+template<typename arg_t, typename scalar_t, int vt0>
+ReduceConfig setReduceConfig(const TensorIterator& iter){
+  // Start by assuming that each thread handles a single output and all
+  // the inputs for that output.
+  int64_t num_outputs = iter.num_output_elements();
+  int64_t inputs_per_output = iter.numel() / num_outputs;
+  int input_index = iter.ntensors() - 1;
+
+  auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output);
+
+  int64_t dim0;
+  int64_t dim1;
+  int64_t fastest_moving_stride;
+  bool reduction_on_fastest_striding_dimension;
+
+  if (iter.ndim() > 0) {
+    // Adjust block size to map block width to fastest changing dimension of input
+    // tensor. This grants the best possible memory accessing pattern, given that
+    // for non-contiguous tensor with space in between, we cannot have perfect
+    // memory coalescing.
+    reduction_on_fastest_striding_dimension =
+        (iter.num_reduce_dims() == iter.ndim()) ||
+        (iter.strides(/*arg=*/input_index)[0] <
+        iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]);
+    // Notice that dim0 & dim1 does NOT guarantee any launch configuration here!
+    // dim0 & dim1 are more like the upper bound of the block dimension. The
+    // actual launch config and reduction scheme is determined by setting values
+    // to `config.input_mult` and `config.output_mult`.
+    // We try to max out dim1 so that we have enough threads per CTA to deliver
+    // performance for larger problem size.
+    if (reduction_on_fastest_striding_dimension) {
+      // Map block.x to the fastest reducing dimension. It implies:
+      //   1. block_x_reduce is required.
+      //   2. block.y now max out to num_outputs.
+      dim0 = inputs_per_output;
+      dim1 = num_outputs;
+      fastest_moving_stride = iter.strides(/*arg=*/input_index)[0];
+    } else {
+      // Map block.x to the fastest non reducing dimension. It implies:
+      //   1. block_x_reduce is turned off.
+      //   2. block.y now max out to inputs_per_output.
+      dim0 = num_outputs;
+      dim1 = inputs_per_output;
+      fastest_moving_stride = iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()];
+    }
+  } else {
+    reduction_on_fastest_striding_dimension = true;
+    fastest_moving_stride = sizeof(scalar_t);
+    dim0 = 1;
+    dim1 = 1;
+  }
+
+  // We do vectorization to gain better memory access, there are two cases which we call
+  // "vectorize along input" and "vectorize along output". Note that the "input/output"
+  // here does not mean we are vectorizing load/store instructions. We always only vectorize
+  // load instructions.
+  //
+  // Case 1: "vectorize along input"
+  // This case happens when we are reducing along fastest moving dimesion. In such case, threads
+  // with the same threadIdx.y works on the same reduction cooperatively and will produce results
+  // for the same output. In such case, values in each loaded vector always correspond to the same output.
+  //
+  // Case 2: "vectorize along output"
+  // This case happens when the fastest moving dimesion is not the dimension of reduction. In such case,
+  // threads with different threadIdx.x are independent and will produce results for different outputs.
+  // In such case, values in each loaded vector always correspond to different outputs.
+  if (fastest_moving_stride == sizeof(scalar_t)) {
+    if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= ReduceConfig::input_vec_size) {
+      // Case 1: "vectorize along input"
+      // Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case,
+      // we should avoid vectorization.
+      config.vectorize_input = true;
+      dim0 /= config.input_vec_size;
+    } else if (!reduction_on_fastest_striding_dimension) {
+      // Case 2: "vectorize along output"
+      config.output_vec_size = get_output_vec_size<scalar_t>(iter);
+      dim0 /= config.output_vec_size;
+    }
+  }
+
+  // Adjust block_width and block_height
+  config.set_block_dimension<scalar_t>(dim0, dim1);
+
+  int block_width = config.block_width;
+  int block_height = config.block_height;
+
+  if (iter.ndim() == 0 || reduction_on_fastest_striding_dimension) {
+    // Split the input across lanes if the input is contiguous in the reduced
+    // dimension. This will require reduction between threads using warp
+    // shuffle instructions and shared memory (if block_width > warpSize).
+    config.input_mult[0] = config.split_input(block_width);
+  } else {
+    // Otherwise split the output across lanes in a warp.
+    config.output_mult[0] = config.split_output(block_width);
+  }
+
+  constexpr int min_values_per_thread = 16;
+  constexpr int max_values_per_thread = 256;
+
+  if (config.values_per_thread() >= block_height * 16 || config.values_per_thread() >= max_values_per_thread) {
+    // Divide the input across warps in a thread-block, if that leaves at least
+    // 16 elements to be summed by each thread. This will require inter-warp
+    // reduction using shared memory.
+    config.input_mult[1] = config.split_input(block_height);
+  } else {
+    // Otherwise, each warp handles a separate output.
+    config.output_mult[1] = config.split_output(block_height);
+  }
+
+  const int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / config.num_threads;
+  const int num_mp = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  const int target_grid_size = num_mp * blocks_per_sm;
+  int grid = config.grid().x;
+  if (config.input_mult[1] != 0 && config.values_per_thread() >= max_values_per_thread && grid <= target_grid_size) {
+    // Divide the input across thread-blocks if the amount of work per-thread
+    // is large enough and the size of the output is small enough. This will
+    // require a reduction using global memory.
+    // If we decide to split input across blocks, as long as we can get enough
+    // number of blocks (`target_grid_size`) to balance SM, we should still
+    // make the number of values per thread large for best performance.
+    int ctas_per_output1 = div_up(target_grid_size, grid);
+    int ctas_per_output2 = div_up(config.values_per_thread(), min_values_per_thread);
+    int ctas_per_output3 = div_up(config.values_per_thread(), max_values_per_thread);
+    // We want the minimum of ctas_per_output1 and ctas_per_output2, so that each thread can have
+    // a large number of values to deal with. But we don't want values_per_thread to be larger than
+    // max_values_per_thread
+    config.ctas_per_output = std::max(std::min<int>(ctas_per_output1, ctas_per_output2), ctas_per_output3);
+    if (config.ctas_per_output > 1) {
+      config.input_mult[2] = config.split_input(config.ctas_per_output);
+    }
+  }
+  return config;
+};
+
+template <typename scalar_t, typename out_scalar_t, int vt0=4, typename ops_t, typename ident_t=double>
+inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0,
+                              AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) {
+  AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1);
+
+  using traits = function_traits<decltype(&ops_t::reduce)>;
+  using arg_t = typename traits::template arg<0>::type;
+  // at::Half/at::ComplexHalf overflows easily as it's range is very small.
+  // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we
+  // set can_accumulate_in_output to False.
+  static constexpr bool is_inp_out_type_half_or_chalf =
+      (std::is_same<at::Half, scalar_t>::value &&
+       std::is_same<at::Half, out_scalar_t>::value) ||
+      (std::is_same<c10::complex<Half>, scalar_t>::value &&
+       std::is_same<c10::complex<Half>, out_scalar_t>::value);
+  // at::BFloat16 has lower precision and can lead to rounding errors.
+  // So when scalar_t and out_scalar_t are at::BFloat16, we
+  // set can_accumulate_in_output to False.
+  static constexpr bool is_inp_out_type_bfloat16 =
+      (std::is_same<at::BFloat16, scalar_t>::value &&
+       std::is_same<at::BFloat16, out_scalar_t>::value);
+  static constexpr bool can_accumulate_in_output =
+      std::is_convertible<arg_t, out_scalar_t>::value &&
+      !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16);
+
+  bool can_use_32bit_indexing = iter.can_use_32bit_indexing();
+  std::unique_ptr<AccumulationBuffer> owned_buf_ptr;
+  // The acc_buf_ptr is a shared pointer. It is create at the first entrance and
+  // reused by all recursive function calls.
+  if (acc_buf_ptr == NULL) {
+    // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter
+    // when accumulation in output is not possible.
+    if (!can_accumulate_in_output && !can_use_32bit_indexing) {
+      int64_t output_memory_size = iter.element_size(0);
+      for (int dim = 0; dim < iter.ndim(); dim++) {
+        output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]);
+      }
+      output_memory_size /= iter.element_size(0); //iter.strides is in bytes
+      owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t),
+                                                 sizeof(out_scalar_t),
+                                                 (char*) iter.data_ptr(0),
+                                                 output_memory_size * sizeof(arg_t)));
+    } else {
+      owned_buf_ptr.reset(new AccumulationBuffer());
+    }
+    acc_buf_ptr = owned_buf_ptr.get();
+  }
+
+  if (!can_use_32bit_indexing) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      int64_t sub_iter_base_idx = sub_iter.view_offsets()[0];
+
+      gpu_reduce_kernel<scalar_t, out_scalar_t, vt0>(sub_iter, ops, ident,
+          acc_buf_ptr, sub_iter_base_idx);
+    }
+    return;
+  }
+
+  const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1);
+  char* out_data = (char*)iter.data_ptr(0);
+  const auto noutputs = iter.noutputs();
+  std::optional<char*> out_data_extra;
+  if (noutputs > 1) {
+    out_data_extra = (char*)iter.data_ptr(1);
+  } else {
+    out_data_extra = std::nullopt;
+  }
+  char* acc_data = acc_buf_ptr->get_acc_slice(out_data);
+
+  ReduceConfig config = setReduceConfig<arg_t, scalar_t, vt0>(iter);
+  at::DataPtr buffer;
+  at::DataPtr semaphores;
+  if (config.should_global_reduce()) {
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
+    buffer = allocator.allocate(config.global_memory_size());
+    semaphores = allocator.allocate(config.semaphore_size());
+
+    auto stream = at::cuda::getCurrentCUDAStream();
+    AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream));
+  }
+
+  AT_ASSERT(can_use_32bit_indexing);
+  auto output_calc = make_output_calculator<uint32_t>(iter);
+  auto input_calc = make_input_calculator<uint32_t>(iter);
+  auto reduce = ReduceOp<scalar_t, ops_t, uint32_t, out_scalar_t, vt0>(
+      ops,
+      config,
+      input_calc,
+      output_calc,
+      in_data,
+      out_data,
+      out_data_extra,
+      acc_data,
+      buffer.get(),
+      (int*)semaphores.get(),
+      ident,
+      noutputs,
+      base_idx);
+  reduce.accumulate = iter.should_accumulate();
+  reduce.final_output = iter.is_final_output();
+
+  launch_reduce_kernel<mnt_wrapper<scalar_t>::MAX_NUM_THREADS>(config, reduce);
+}
+
+//TODO this is 100 lines of almost-copy-paste, because we have to have different template args for this function
+//try unifying with gpu_reduce_kernel
+template <char const* name, typename scalar_t, typename out_scalar_t, int vt0=4, typename ident_t=double>
+inline void jitted_gpu_reduce_kernel(TensorIterator& iter, const std::string& func, ident_t ident=0,
+                              AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) {
+  AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1);
+
+  //TODO - this will be different for more complicated reductions, but for now reductions using
+  //func_wrapper all have arg_t = opmath
+  using arg_t = at::opmath_type<scalar_t>;
+  // at::Half/at::ComplexHalf overflows easily as it's range is very small.
+  // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we
+  // set can_accumulate_in_output to False.
+  static constexpr bool is_inp_out_type_half_or_chalf =
+      (std::is_same<at::Half, scalar_t>::value &&
+       std::is_same<at::Half, out_scalar_t>::value) ||
+      (std::is_same<c10::complex<Half>, scalar_t>::value &&
+       std::is_same<c10::complex<Half>, out_scalar_t>::value);
+  // at::BFloat16 has lower precision and can lead to rounding errors.
+  // So when scalar_t and out_scalar_t are at::BFloat16, we
+  // set can_accumulate_in_output to False.
+  static constexpr bool is_inp_out_type_bfloat16 =
+      (std::is_same<at::BFloat16, scalar_t>::value &&
+       std::is_same<at::BFloat16, out_scalar_t>::value);
+  static constexpr bool can_accumulate_in_output =
+      std::is_convertible<arg_t, out_scalar_t>::value &&
+      !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16);
+
+  bool can_use_32bit_indexing = iter.can_use_32bit_indexing();
+  std::unique_ptr<AccumulationBuffer> owned_buf_ptr;
+
+  // The acc_buf_ptr is a shared pointer. It is create at the first entrance and
+  // reused by all recursive function calls.
+  if (acc_buf_ptr == NULL) {
+    // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter
+    // when accumulation in output is not possible.
+    if (!can_accumulate_in_output && !can_use_32bit_indexing) {
+      int64_t output_memory_size = iter.element_size(0);
+      for (int dim = 0; dim < iter.ndim(); dim++) {
+        output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]);
+      }
+      output_memory_size /= iter.element_size(0); //iter.strides is in bytes
+      owned_buf_ptr.reset(new AccumulationBuffer(sizeof(out_scalar_t), //TODO
+                                                 sizeof(out_scalar_t),
+                                                 (char*) iter.data_ptr(0),
+                                                 output_memory_size * sizeof(out_scalar_t))); //TODO
+    } else {
+      owned_buf_ptr.reset(new AccumulationBuffer());
+    }
+    acc_buf_ptr = owned_buf_ptr.get();
+  }
+
+  if (!can_use_32bit_indexing) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      int64_t sub_iter_base_idx = sub_iter.view_offsets()[0];
+
+      jitted_gpu_reduce_kernel<name, scalar_t, out_scalar_t, vt0>(sub_iter, func, ident,
+          acc_buf_ptr, sub_iter_base_idx);
+    }
+    return;
+  }
+
+  //TODO - for now we support a single input, we may be able to relax this constraint
+  const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1);
+  char* out_data = (char*)iter.data_ptr(0);
+  const auto noutputs = iter.noutputs();
+  std::optional<char*> out_data_extra;
+  if (noutputs > 1) {
+    out_data_extra = (char*)iter.data_ptr(1);
+  } else {
+    out_data_extra = std::nullopt;
+  }
+  char* acc_data = acc_buf_ptr->get_acc_slice(out_data);
+
+  ReduceConfig config = setReduceConfig<arg_t, scalar_t, vt0>(iter);
+
+  at::DataPtr buffer;
+  at::DataPtr semaphores;
+  if (config.should_global_reduce()) {
+    auto& allocator = *c10::cuda::CUDACachingAllocator::get();
+    buffer = allocator.allocate(config.global_memory_size());
+    semaphores = allocator.allocate(config.semaphore_size());
+
+    auto stream = at::cuda::getCurrentCUDAStream();
+    AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream));
+  }
+
+  AT_ASSERT(can_use_32bit_indexing);
+  auto output_calc = make_output_calculator<uint32_t>(iter);
+  auto input_calc = make_input_calculator<uint32_t>(iter);
+  auto reduce = ReduceJitOp<scalar_t, out_scalar_t>(
+      config,
+      input_calc,
+      output_calc,
+      in_data,
+      out_data,
+      out_data_extra,
+      acc_data,
+      buffer.get(),
+      (int*)semaphores.get(),
+      ident,
+      noutputs,
+      base_idx);
+  reduce.accumulate = iter.should_accumulate();
+  reduce.final_output = iter.is_final_output();
+
+  constexpr int nInputs = 1;
+  constexpr int nOutputs = 1;
+  static auto desc = at::cuda::jit::make_kernel_descriptor<
+    out_scalar_t, scalar_t>(name, func, nInputs, nOutputs);
+
+  static std::mutex jiterator_mutex;
+  static std::vector<std::array<at::cuda::jit::NvrtcFunction, 3>> fn_cache(c10::cuda::device_count());
+  auto &cache = fn_cache[iter.device().index()];
+
+  launch_jitted_reduce_kernel(
+      jiterator_mutex, cache, desc, vt0, config, &reduce);
+}
+
+}} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ReduceOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ReduceOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..a67a019ae49e2ec73be935aa4cae703417e9bb19
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ReduceOps.h
@@ -0,0 +1,20 @@
+
+namespace at {
+struct TensorIterator;
+}
+
+namespace c10 {
+class Scalar;
+}
+
+namespace at { namespace native {
+
+void norm_launch_kernel(TensorIterator &iter, double val);
+void min_launch_kernel(TensorIterator &iter);
+void max_launch_kernel(TensorIterator &iter);
+void aminmax_launch_kernel(TensorIterator &iter);
+void min_all_launch_kernel(TensorIterator &iter);
+void max_all_launch_kernel(TensorIterator &iter);
+void aminmax_allreduce_launch_kernel(TensorIterator &iter);
+
+}}  // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanKernels.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanKernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..28e65372511bc7b50390a134e01554b6fa9ee171
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/ScanKernels.h
@@ -0,0 +1,18 @@
+#pragma once
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+
+namespace native {
+
+// NOTE: these functions require output tensors to be contiguous
+void launch_cummax_cuda_kernel(const TensorBase& self, const TensorBase& values,
+                               const TensorBase& indices, int64_t dim);
+void launch_cummin_cuda_kernel(const TensorBase& self, const TensorBase& values,
+                               const TensorBase& indices, int64_t dim);
+void launch_logcumsumexp_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim);
+void launch_cumsum_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim);
+void launch_cumprod_cuda_kernel(const TensorBase& result, const TensorBase& self, int64_t dim);
+
+}}  // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h
new file mode 100644
index 0000000000000000000000000000000000000000..656b4ce2c2bbac167457f31e8f554a5e409a2940
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h
@@ -0,0 +1,17 @@
+#pragma once
+#include <cstdint>
+#include <ATen/core/TensorBase.h>
+#include <ATen/native/cuda/SortStable.h>
+
+namespace at {
+namespace native {
+
+inline bool should_use_small_sort(const TensorBase &self, int64_t dim) {
+  return self.size(dim) <= 4096;
+}
+
+void sortKeyValueInplace(
+    const TensorBase &key, const TensorBase &value, int dim,
+    bool descending, bool stable=false);
+
+}}  // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1aefc2474fdfc5cab69c89a8fdb1b8be982394e8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh
@@ -0,0 +1,435 @@
+#pragma once
+
+#include <ATen/cuda/detail/IndexUtils.cuh>
+#include <ATen/native/cuda/Loops.cuh>
+#include <ATen/native/cuda/SortingCommon.cuh>
+#include <ATen/native/cuda/block_reduce.cuh>
+
+namespace at {
+namespace native {
+
+// Used for a segmented reduction
+struct ModeUnsignedBoolPair {
+  unsigned int val;
+  bool flag;
+};
+
+// In the kernel below, we have a common pattern of reducing (unsigned int,
+// unsigned int) pairs of data
+struct ModeUnsignedPair {
+  unsigned int val;
+  unsigned int index;
+};
+
+// Inclusive Scan via an upsweep/downsweep mechanism. Assumes:
+//
+// 1. Power2ScanSize is a power of 2. This code still works for collections that
+// do not exactly contain a power of 2 number of elements, simply round up to
+// the nearest power of 2 and then call.
+//
+// 2. That there are two-elements per thread, i.e. the size of the smem storage
+// is 2 * blockDim.x * sizeof(T).
+//
+// Consider a (+)-Scan on the following elements:
+//
+// Upsweep:
+//
+//    0  1  2  3  4  5  6  7
+//       1     5     9    13
+//             6          22
+//                        28
+//
+// Downsweep:
+//                  15
+//         3     10    21
+template <int Power2ScanSize, typename T, class BinaryOp>
+__device__ void inclusivePrefixScan(T* smem, BinaryOp binop) {
+  // Reduce step ("upsweep")
+#pragma unroll
+  for (int stride = 1; stride < Power2ScanSize; stride <<= 1) {
+    int index = (threadIdx.x + 1) * stride * 2 - 1;
+    if (index < Power2ScanSize) {
+      smem[index] = binop(smem[index], smem[index - stride]);
+    }
+    __syncthreads();
+  }
+
+  // Post-reduce step ("downsweep")
+#pragma unroll
+  for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) {
+    int index = (threadIdx.x + 1) * stride * 2 - 1;
+    if ((index + stride) < Power2ScanSize) {
+      smem[index + stride] = binop(smem[index + stride], smem[index]);
+    }
+    __syncthreads();
+  }
+}
+
+// Block-wide reduction where each thread locally reduces N
+// values before letting a single warp take over - assumes
+// threadVals is in registers, not shared memory
+//
+// If smem is not used again, there is no need to __syncthreads before this
+// call. However, if smem will be used, e.g., this function is called in a loop,
+// then __syncthreads is needed either before or afterwards to prevent non-0
+// threads overriding smem in the next loop before num-0 thread reads from it.
+template <int N, typename T, typename ReduceOp>
+__device__ T reduceBlockWithNThreadLocalReductions(
+    T* smem,
+    T threadVals[N],
+    const unsigned int numVals,
+    ReduceOp reduceOp,
+    T init) {
+  int offset = threadIdx.x * N;
+  T local = offset < numVals ? threadVals[0] : init;
+
+#pragma unroll
+  for (int i = 1; i < N; ++i) {
+    ++offset;
+    T next = offset < numVals ? threadVals[i] : init;
+    local = reduceOp.combine(local, next);
+  }
+
+  return cuda_utils::BlockReduce(local, reduceOp, init, smem);
+}
+
+template <typename T>
+__device__ inline void swapVars(T& t1, T& t2) {
+  T tmp = t1;
+  t1 = t2;
+  t2 = tmp;
+}
+
+template <typename Comparator, typename K, typename V>
+__device__ inline void bitonicSwap(
+    K& kA,
+    V& vA,
+    bool& validA,
+    K& kB,
+    V& vB,
+    bool& validB,
+    bool dir,
+    const Comparator& comp) {
+  // Invalid entries always sort to the end
+  bool swap = (comp(kA, kB) && validA) || !validB;
+  if (swap == dir) {
+    swapVars(kA, kB);
+    swapVars(vA, vB);
+    swapVars(validA, validB);
+  }
+};
+
+template <typename Comparator, typename K>
+__device__ inline void bitonicSwapKeys(
+    K& kA,
+    bool& validA,
+    K& kB,
+    bool& validB,
+    bool dir,
+    const Comparator& comp) {
+  bool swap = (comp(kA, kB) && validA) || !validB;
+  if (swap == dir) {
+    swapVars(kA, kB);
+    swapVars(validA, validB);
+  }
+}
+
+template <
+    typename K,
+    typename IndexType,
+    int Power2SortSize,
+    typename Comparator>
+__device__ inline void bitonicSortKeys(
+    K keys[Power2SortSize],
+    bool valid[Power2SortSize],
+    const Comparator& comp) {
+#if !defined(USE_ROCM)
+#pragma unroll
+#endif
+  for (unsigned int size = 2; size < Power2SortSize; size *= 2) {
+    bool flag = ((threadIdx.x & (size / 2)) != 0);
+
+#if !defined(USE_ROCM)
+#pragma unroll
+#endif
+    for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
+      __syncthreads();
+
+      unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+      bitonicSwapKeys<Comparator, K>(
+          keys[pos],
+          valid[pos],
+          keys[pos + stride],
+          valid[pos + stride],
+          flag,
+          comp);
+    }
+  }
+
+#if !defined(USE_ROCM)
+#pragma unroll
+#endif
+  for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) {
+    __syncthreads();
+
+    unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
+    bitonicSwapKeys<Comparator, K>(
+        keys[pos],
+        valid[pos],
+        keys[pos + stride],
+        valid[pos + stride],
+        false,
+        comp);
+  }
+
+  __syncthreads();
+}
+
+// The mode kernel has the following characteristics: It uses internal shared
+// memory buffers of Power2Size, which must be greater than the number of
+// elements. Additionally, there is one block for every slice to calculate the
+// mode for, and in each block there is one thread for every two elements.
+//
+// Both sorted and positions are assumed to be contiguous Tensors with the mode
+// dimension as the innermost dim, such that we can get the particular slice for
+// a Tensor via its linear block dimension * the slice size.
+template <typename T, unsigned int Power2Size>
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 11070
+__launch_bounds__(1024, 1)
+#endif
+__global__ void compute_mode(
+    const T* input,
+    at::cuda::detail::TensorInfo<T, unsigned int> values,
+    at::cuda::detail::TensorInfo<int64_t, unsigned int> indices,
+    int64_t sliceSize,
+    int64_t slices) {
+  int tidx = threadIdx.x;
+  int stidx = blockDim.x + threadIdx.x; // Second index this thread responsible for
+
+  // First, we need to calculate the offset into the sorted Tensor that
+  // represents the start of the slice for this block to calculate the mode for.
+  // This offset is a combination of the gridIndices, and the number of elements
+  // in the slice.
+  unsigned int blockId = getLinearBlockId<unsigned int>();
+  unsigned int linearOffset = blockId * sliceSize;
+
+  if (blockId >= slices) {
+      return;
+  }
+
+  // shmem is a dynamically sized buffer we will use throughout the kernel to
+  // handle computation efficiently. The size of this shmem must be
+  // sizeof(T) * Power2Size + (2 * sizeof(unsigned int) * Power2Size)
+  //
+  // Initially, the buffer will be organized as follows:
+  //
+  // [smem (slice elements) | bmem (valid indices) | <scratch space>]
+  extern __shared__ char shmem[];
+
+  // smem represents a proportion of the shared memory buffer that is used to
+  // store the elements from the slice:
+  T* smem = reinterpret_cast<T*>(shmem);
+
+  // Each thread loads up to two elements from the Tensor into shared memory
+  if (tidx < sliceSize) {
+    smem[tidx] = c10::load(&input[linearOffset + tidx]);
+  }
+  if (stidx < sliceSize) {
+    smem[stidx] = c10::load(&input[linearOffset + stidx]);
+  }
+
+  // Next, we initialize a boolean region of the buffer, offset by the loaded
+  // element smem region
+  bool* bmem = reinterpret_cast<bool*>(&smem[Power2Size]);
+
+  // The first use of this region stores bmem[i] = i < sliceSize to mark the
+  // valid components in the smem buffer
+  bmem[tidx] = tidx < sliceSize;
+  bmem[stidx] = stidx < sliceSize;
+  __syncthreads(); // barrier for smem, bmem initialization
+
+  // First, sort the input slice in ascending order. smem contains the input
+  // elements, and bmem marks the valid indices
+  bitonicSortKeys<T, unsigned int, Power2Size>(
+      smem, bmem, [&] GPU_LAMBDA(const auto& a, const auto& b) {
+        return a < b;
+      });
+  __syncthreads(); // make no assumptions that the sort syncs at end
+
+  // The next step of our algorithm is performing a block-wide comparison of
+  // neighboring elements. In particular, given an sorted input slice A, we
+  // produce an output slice B, such that B[i] = 1 if A[i-i] != A[i], otherwise
+  // 0.
+  //
+  // Given the input A = [0, 0, 1, 1, 2, 2, 2, 4, 5, 6, 6, 7, 8]
+  //                 B = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1]
+  //
+  // In particular, we can think of B[i] true indicating the start of a sequence
+  // of equal values in the sorted list. Similarly, we will also store the
+  // negation of B, which we'll call C. In particular, we can think of C[i] =
+  // true iff A[i-1] == A[i] in our original sorted slice.
+  //
+  //                 C = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0]
+
+  // We overwrite bmem, and treat the rest of shared memory as a buffer of
+  // (index, flag) pairs where the index represents values from C, and the flag
+  // represents values from B.
+  //
+  // [smem (sorted slice) | ubpmem (index, flag pairs)]
+
+  struct ModeUnsignedBoolPair* ubpmem =
+      reinterpret_cast<struct ModeUnsignedBoolPair*>(&smem[Power2Size]);
+
+  if (tidx == 0) {
+    ubpmem[0].flag = true;
+    ubpmem[0].val = 0;
+  }
+
+  // Compares elements (0, 1), (2, 3), ... and sets 1, 3, ...
+  ubpmem[tidx * 2 + 1].flag =
+      smem[tidx * 2] != smem[tidx * 2 + 1]; // (0, 1), (1, 2), etc.
+  ubpmem[tidx * 2 + 1].val = !ubpmem[tidx * 2 + 1].flag;
+
+  // Compares elements (1, 2), (3, 4), ... and sets 2, 4, ...
+  if (((tidx + 1) * 2) < Power2Size) {
+    ubpmem[(tidx + 1) * 2].flag =
+        smem[((tidx + 1) * 2) - 1] != smem[(tidx + 1) * 2];
+    ubpmem[(tidx + 1) * 2].val = !ubpmem[(tidx + 1) * 2].flag;
+  }
+  __syncthreads(); // barrier for ubpmem initialization
+
+  // Next, we perform a segmented prefix sum on the neighboring elements, where
+  // the presence of a one indicates the start of a segment. In this case B acts
+  // as the segment start flags, and C is the buffer to be summed:
+  //
+  // Input  (C)  = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0]
+  // Flag   (B)  = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1]
+  // Output (C)  = [0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0]
+  //
+  // Afterwards, the (index) components of the ubpmem buffer contain the lengths
+  // of the segments (minus 1), i.e. the counts of each element in the original
+  // input.
+  inclusivePrefixScan<Power2Size>(
+      ubpmem, [=] GPU_LAMBDA(const auto& a, const auto& b) {
+        ModeUnsignedBoolPair c;
+        c.val = a.flag ? a.val : a.val + b.val;
+        c.flag = a.flag | b.flag;
+        return c;
+      });
+  // assumes scan syncs at the end
+
+  // Next, we reinterpret the ubpmem buffer as pairs of unsigned integers (i.e.
+  // we treat the boolean flag regions as integers). We initialize these to
+  // represent indices, and we'll call this buffer I
+  struct ModeUnsignedPair* uupmem =
+      reinterpret_cast<struct ModeUnsignedPair*>(ubpmem);
+
+  // At this point, we need to find the maximum element in lengths buffer C.
+  // This element will represent the count (-1) of the mode. Because of the
+  // way we have set up the problem, the index where this mode occurs will
+  // also be the location of the mode value in the sorted array, e.g.
+  //
+  // smem = [0, 0, 1, 1, 1, 2]
+  // C    = [0, 1, 0, 1, 2, 0]
+  // I    = [0, 1, 2, 3, 4, 5]
+  //                     ^
+  //                     maximum value, also aligned with mode = 1
+  //
+  // We perform a block wide max-reduction of the C buffer, but we also need the
+  // indices to come along with it, so we utilize the uupmem construction.
+  //
+  // At the end we need to return the ModeUnsignedPair containing index = 4, val
+  // = 2, which represents the max
+
+  // In practice, we will make each thread locally reduce 2 values in its
+  // registers prior to the global block-wide reduction. Note that instead of
+  // tidx/stidx, we utilize tidx * 2, tidx * 2 + 1, so each thread deals with
+  // adjacent elements. This is because the reduce code below relies on thread
+  // elements to be adjacent.
+  struct ModeUnsignedPair uup[2];
+  uup[0].index = tidx * 2;
+  uup[0].val = ubpmem[tidx * 2].val;
+  uup[1].index = tidx * 2 + 1;
+  uup[1].val = ubpmem[tidx * 2 + 1].val;
+  __syncthreads();
+
+  struct ModeUnsignedPair max = {0, 0};
+
+  struct MaxOp {
+    inline __device__ ModeUnsignedPair combine(ModeUnsignedPair a, ModeUnsignedPair b) const {
+      return b.val > a.val ? b : a;
+    }
+
+    inline __device__ ModeUnsignedPair warp_shfl_down(ModeUnsignedPair acc, int offset) const {
+      ModeUnsignedPair ret;
+      ret.index = WARP_SHFL_DOWN(acc.index, offset);
+      ret.val = WARP_SHFL_DOWN(acc.val, offset);
+      return ret;
+    }
+  } max_op;
+
+  max = reduceBlockWithNThreadLocalReductions<2>(
+      uupmem,
+      uup,
+      sliceSize,
+      max_op,
+      max);
+
+  // Store the mode in shared memory for use in finding the mode in the input
+  // slice
+  __shared__ T mode;
+
+  // Given the above constraints, the mode is the value at the reduced index in
+  // the original sorted element buffer
+  if (tidx == 0) {
+    mode = smem[max.index];
+  }
+  __syncthreads(); // broadcast mode
+
+  // Finally, we need to find "an" index of the mode in the input
+  // Tensor. The API does not constrain which index we pick, but here
+  // we always pick the largest index. We store the index if the value
+  // is the mode, or 0 otherwise. Then find the maximum value.
+  //
+  // Again we reduce 2 elements in the thread's registers prior to the
+  // block-wide reduction
+  unsigned mode_index[2] = {0u, 0u};
+  if (tidx * 2 < sliceSize) {
+    const unsigned idx = tidx * 2;
+    mode_index[0] = c10::load(&input[linearOffset + idx]) == mode ? idx : 0u;
+  }
+  if (tidx * 2 + 1 < sliceSize) {
+    const unsigned idx = tidx * 2 + 1;
+    mode_index[1] = c10::load(&input[linearOffset + idx]) == mode ? idx : 0u;
+  }
+
+  struct MaxIndexOp {
+    inline __device__ unsigned combine(unsigned a, unsigned b) const {
+      return b > a ? b : a;
+    }
+
+    inline __device__ unsigned warp_shfl_down(unsigned acc, int offset) const {
+      return WARP_SHFL_DOWN(acc, offset);
+    }
+  } max_index_op;
+
+  int64_t index = reduceBlockWithNThreadLocalReductions<2>(
+      reinterpret_cast<unsigned*>(&shmem[0]),
+      mode_index,
+      sliceSize,
+      max_index_op,
+      0u);
+
+  // Finally, we have the mode, and an index where it occurs. We use a single
+  // thread to place this in the appropriate output position
+  if (tidx == 0) {
+    unsigned int outputOffset =
+        at::cuda::detail::IndexToOffset<T, unsigned int, -1>::get(
+            blockId, values);
+    values.data[outputOffset] = mode;
+    indices.data[outputOffset] = index;
+  }
+}
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5660747997d4eb1ad56f79ec2d1f519921c05c2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.h
@@ -0,0 +1,19 @@
+#pragma once
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at {
+namespace native {
+
+void launch_fused_mode_kernel(
+    const TensorBase &values, const TensorBase &indices,
+    const TensorBase &self, int64_t slice_size, int64_t slices);
+
+void launch_apply_mode_kernel(
+    const TensorBase &values, const TensorBase &indices,
+    const TensorBase &self, int64_t dim, int64_t ndim);
+
+}}  // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6e1cccc2e175cb26b1ee12690d67d1514e95f246
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/UniqueCub.cuh
@@ -0,0 +1,16 @@
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+namespace internal {
+
+template <typename scalar_t>
+std::tuple<Tensor, Tensor, Tensor> unique_cuda_template(
+    const Tensor& self,
+    const bool consecutive,
+    const bool return_inverse,
+    const bool return_counts);
+
+} // namespace internal
+} // namespace at
+} // namespace native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..d95eaaeccee2e93a675728e5d8e810403caca978
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh
@@ -0,0 +1,40 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adam_amsgrad_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const std::optional<at::Tensor>& grad_scale,
+    const std::optional<at::Tensor>& found_inf);
+
+void _fused_adam_amsgrad_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const at::Tensor& lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const std::optional<at::Tensor>& grad_scale,
+    const std::optional<at::Tensor>& found_inf);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..8369496fb31d74e2a1331f9f88c8cea4b99f1c6f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh
@@ -0,0 +1,38 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adam_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const std::optional<at::Tensor>& grad_scale,
+    const std::optional<at::Tensor>& found_inf);
+
+void _fused_adam_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const at::Tensor& lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const std::optional<at::Tensor>& grad_scale,
+    const std::optional<at::Tensor>& found_inf);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4d1fa405ab9639bd2a449fd3c7ced53af628a7e9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh
@@ -0,0 +1,40 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adamw_amsgrad_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const std::optional<at::Tensor>& grad_scale,
+    const std::optional<at::Tensor>& found_inf);
+
+void _fused_adamw_amsgrad_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList max_exp_avg_sqs,
+    at::TensorList state_steps,
+    const at::Tensor& lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const std::optional<at::Tensor>& grad_scale,
+    const std::optional<at::Tensor>& found_inf);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..32d580c31d118e1888f591ee7f74ee48bce462e7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh
@@ -0,0 +1,38 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+
+void _fused_adamw_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const double lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const std::optional<at::Tensor>& grad_scale,
+    const std::optional<at::Tensor>& found_inf);
+
+void _fused_adamw_cuda_impl_(
+    at::TensorList params,
+    at::TensorList grads,
+    at::TensorList exp_avgs,
+    at::TensorList exp_avg_sqs,
+    at::TensorList state_steps,
+    const at::Tensor& lr,
+    const double beta1,
+    const double beta2,
+    const double weight_decay,
+    const double eps,
+    const bool maximize,
+    const std::optional<at::Tensor>& grad_scale,
+    const std::optional<at::Tensor>& found_inf);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6d1e861493d4f0fe2d0038c280f35741ff3add5e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh
@@ -0,0 +1,681 @@
+namespace at {
+namespace cuda {
+//windows doesn't like large string literals, so split in two
+const std::string reduction_template_0 = R"ESCAPE(
+  #define C10_HOST_DEVICE __host__ __device__
+  #define C10_DEVICE __device__
+  #if defined(__clang__) && defined(__HIP__)
+  #ifndef __forceinline__
+  #define __forceinline__ inline __attribute__((always_inline))
+  #endif
+  // until ROCm support for kernel asserts is restored
+  #define assert(expr) (static_cast<void>(0))
+  #endif
+
+  template <typename T>
+  __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
+  {
+  #if defined(__clang__) && defined(__HIP__)
+    return __shfl_down(value, delta, width);
+  #else
+    return __shfl_down_sync(mask, value, delta, width);
+  #endif
+  }
+
+
+  #if ${complex}
+  template <typename T>
+  __device__ __forceinline__ std::complex<T> WARP_SHFL_DOWN(std::complex<T> value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
+  {
+    return std::complex<T>(
+  #if defined(__clang__) && defined(__HIP__)
+        __shfl_down(value.real(), delta, width),
+        __shfl_down(value.imag(), delta, width));
+  #else
+        __shfl_down_sync(mask, value.real(), delta, width),
+        __shfl_down_sync(mask, value.imag(), delta, width));
+  #endif
+  }
+  #endif
+
+  // aligned vector generates vectorized load/store on CUDA
+  template<typename scalar_t, int vec_size>
+  struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
+    scalar_t val[vec_size];
+  };
+
+
+  C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) {
+    // get GCD of num and denom using Euclid's algorithm.
+    // Can replace this with std::gcd if we ever support c++17.
+    size_t a = denominator;
+    size_t b = numerator;
+    while (b != 0) {
+        a %= b;
+        // swap(a,b)
+        size_t tmp = a;
+        a = b;
+        b = tmp;
+    }
+
+    // a is now the GCD
+    numerator /= a;
+    denominator /= a;
+  }
+
+
+
+
+  struct ReduceConfig {
+  //has to match host-side ReduceConfig in the eager code
+  static constexpr int BLOCK_X = 0;
+  static constexpr int BLOCK_Y = 1;
+  static constexpr int CTA = 2;
+
+  static constexpr int input_vec_size = 4;
+  int element_size_bytes;
+  int num_inputs;
+  int num_outputs;
+  int step_input = 1;
+  int step_output = 1;
+  int ctas_per_output = 1;
+  int input_mult[3] = {0, 0, 0};
+  int output_mult[2] = {0, 0};
+
+  int block_width;
+  int block_height;
+  int num_threads;
+
+  bool vectorize_input = false;
+  int output_vec_size = 1;
+
+  C10_HOST_DEVICE bool should_block_x_reduce() const {
+    return input_mult[BLOCK_X] != 0;
+  }
+
+  C10_HOST_DEVICE bool should_block_y_reduce() const {
+    return input_mult[BLOCK_Y] != 0;
+  }
+
+  C10_HOST_DEVICE bool should_global_reduce() const {
+    return input_mult[CTA] != 0;
+  }
+
+  C10_DEVICE bool should_store(int output_idx) const {
+    return output_idx < num_outputs &&
+      (!should_block_x_reduce() || threadIdx.x == 0) &&
+      (!should_block_y_reduce() || threadIdx.y == 0);
+  }
+
+  C10_DEVICE bool should_reduce_tail() const {
+    return (!should_block_y_reduce() || threadIdx.y == 0) &&
+      (!should_global_reduce() || blockIdx.y == 0);
+  }
+
+  C10_HOST_DEVICE int input_idx() const {
+    int lane = threadIdx.x;
+    int warp = threadIdx.y;
+    int cta2 = blockIdx.y;
+    return (lane * input_mult[BLOCK_X] +
+            warp * input_mult[BLOCK_Y] +
+            cta2 * input_mult[CTA]);
+  }
+
+  template <int output_vec_size>
+  C10_HOST_DEVICE int output_idx() const {
+    int lane = threadIdx.x;
+    int warp = threadIdx.y;
+    int cta1 = blockIdx.x;
+    return (lane * output_mult[BLOCK_X] +
+            warp * output_mult[BLOCK_Y] +
+            cta1 * step_output) * output_vec_size;
+  }
+
+  C10_DEVICE int shared_memory_offset(int offset) const {
+    return threadIdx.x + (threadIdx.y + offset) * blockDim.x;
+  }
+
+  C10_DEVICE int staging_memory_offset(int cta2) const {
+    int offset = cta2 + blockIdx.x * gridDim.y;
+    if (!should_block_x_reduce()) {
+      offset = threadIdx.x + offset * blockDim.x;
+    }
+    return offset;
+  }
+
+
+  };
+
+
+//TODO this will need to be different for more generic reduction functions
+namespace reducer {
+
+  using scalar_t = ${scalar_type};
+  using arg_t = ${reduction_accum_type};
+  using out_scalar_t = ${result_type};
+
+
+  inline __device__ ${functor}
+
+  inline __device__ out_scalar_t project(arg_t arg) {
+    return (out_scalar_t) arg;
+  }
+
+  inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) {
+    return WARP_SHFL_DOWN(arg, offset);
+  }
+
+  inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) {
+    return acc;
+  }
+
+  // wrap a normal reduction that ignores the index
+  inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) {
+     return combine(acc, val);
+  }
+}
+
+
+struct ReduceJitOp {
+  using scalar_t = ${scalar_type};
+  using arg_t = ${reduction_accum_type};
+  using out_scalar_t = ${result_type};
+
+  using InputCalculator = OffsetCalculator<1>;
+  using OutputCalculator = OffsetCalculator<2>;
+
+//   static constexpr bool can_accumulate_in_output =
+//     std::is_convertible<arg_t, out_scalar_t>::value
+//     && std::is_convertible<out_scalar_t, arg_t>::value;
+
+  static constexpr int input_vec_size = ReduceConfig::input_vec_size;
+
+  arg_t ident;
+  ReduceConfig config;
+  InputCalculator input_calc;
+  OutputCalculator output_calc;
+  const void* src;
+  const char* dst[2]; //it accepts at most two destinations
+  // acc_buf used for accumulation among sub Tensor Iterator when accumulation on
+  // output is not permissible
+  void* acc_buf;
+  // cta_buf used for accumulation between blocks during global reduction
+  void* cta_buf;
+  int* semaphores;
+  int64_t base_idx;
+  bool accumulate;
+  bool final_output;
+  int noutputs;
+
+
+  C10_DEVICE void run() const {
+    extern __shared__ char shared_memory[];
+    uint32_t output_idx = config.output_idx<${output_vec_size}>();
+    uint32_t input_idx = config.input_idx();
+    auto base_offsets1 = output_calc.get(output_idx)[1];
+
+    using arg_vec_t = Array<arg_t, ${output_vec_size}>;
+    arg_vec_t value;
+
+    if (output_idx < config.num_outputs && input_idx < config.num_inputs) {
+      const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1);
+
+      value = thread_reduce<${output_vec_size}>(input_slice);
+    }
+
+    if (config.should_block_y_reduce()) {
+      value = block_y_reduce<${output_vec_size}>(value, shared_memory);
+    }
+    if (config.should_block_x_reduce()) {
+      value = block_x_reduce<${output_vec_size}>(value, shared_memory);
+    }
+
+    using out_ptr_vec_t = Array<out_scalar_t*, ${output_vec_size}>;
+    using offset_vec_t = Array<uint32_t, ${output_vec_size}>;
+    offset_vec_t base_offsets;
+    out_ptr_vec_t out;
+
+    #pragma unroll
+    for (int i = 0; i < ${output_vec_size}; i++) {
+      base_offsets[i] = output_calc.get(output_idx + i)[0];
+      out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
+    }
+
+    arg_vec_t* acc = nullptr;
+    if (acc_buf != nullptr) {
+      size_t numerator = sizeof(arg_t);
+      size_t denominator = sizeof(out_scalar_t);
+      reduce_fraction(numerator, denominator);
+      acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator));
+    }
+
+    if (config.should_global_reduce()) {
+      value = global_reduce<${output_vec_size}>(value, acc, shared_memory);
+    } else if (config.should_store(output_idx)) {
+      if (accumulate) {
+        #pragma unroll
+        for (int i = 0; i < ${output_vec_size}; i++) {
+          value[i] = reducer::translate_idx(value[i], base_idx);
+        }
+      }
+
+      if (acc == nullptr) {
+        if (accumulate) {
+          value = accumulate_in_output<${output_vec_size}>(out, value);
+        }
+        if (final_output) {
+          set_results_to_output<${output_vec_size}>(value, base_offsets);
+        } else {
+          #pragma unroll
+          for (int i = 0; i < ${output_vec_size}; i++) {
+            *(out[i]) = get_accumulated_output(out[i], value[i]);
+          }
+        }
+      } else {
+        if (accumulate) {
+          #pragma unroll
+          for (int i = 0; i < ${output_vec_size}; i++) {
+            value[i] = reducer::combine((*acc)[i], value[i]);
+          }
+        }
+        if (final_output) {
+          set_results_to_output<${output_vec_size}>(value, base_offsets);
+        } else {
+          *acc = value;
+        }
+      }
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE Array<arg_t, output_vec_size> thread_reduce(const scalar_t* data) const {
+    if (config.vectorize_input) {
+      assert(output_vec_size == 1);
+      // reduce at the header of input_slice where memory is not aligned,
+      // so that thread_reduce will have an aligned memory to work on.
+      return {input_vectorized_thread_reduce_impl(data)};
+    } else {
+      uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t);
+      bool is_contiguous = (input_calc.dims == 1 && element_stride == 1);
+      if (is_contiguous) {
+        return thread_reduce_impl<output_vec_size>(data, [](uint32_t idx) { return idx; });
+      } else if (input_calc.dims == 1) {
+        return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return idx * element_stride; });
+      } else {
+        return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); });
+      }
+    }
+  }
+
+  C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const {
+    uint32_t end = config.num_inputs;
+
+    // Handle the head of input slice where data is not aligned
+    arg_t value = ident;
+    constexpr int align_bytes = alignof(aligned_vector<scalar_t, input_vec_size>);
+    constexpr int align_elements = align_bytes / sizeof(scalar_t);
+    int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t);
+    if (shift > 0) {
+      data -= shift;
+      end += shift;
+      if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){
+        value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift);
+      }
+      end -= align_elements;
+      data += align_elements;
+      shift = align_elements - shift;
+    }
+
+    // Do the vectorized reduction
+    using load_t = aligned_vector<scalar_t, input_vec_size>;
+
+    uint32_t idx = config.input_idx();
+    const uint32_t stride = config.step_input;
+
+    // Multiple accumulators to remove dependency between unrolled loops.
+    arg_t value_list[input_vec_size];
+    value_list[0] = value;
+
+    #pragma unroll
+    for (int i = 1; i < input_vec_size; i++) {
+      value_list[i] = ident;
+    }
+
+    scalar_t values[input_vec_size];
+
+    load_t *values_vector = reinterpret_cast<load_t*>(&values[0]);
+
+    while (idx * input_vec_size + input_vec_size - 1 < end) {
+      *values_vector = reinterpret_cast<const load_t*>(data)[idx];
+      #pragma unroll
+      for (uint32_t i = 0; i < input_vec_size; i++) {
+        value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i);
+      }
+      idx += stride;
+    }
+
+    // tail
+    uint32_t tail_start = end - end % input_vec_size;
+    if (config.should_reduce_tail()) {
+      int idx = tail_start + threadIdx.x;
+      if (idx < end) {
+        value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift);
+      }
+    }
+
+    // combine accumulators
+    #pragma unroll
+    for (int i = 1; i < input_vec_size; i++) {
+      value_list[0] = reducer::combine(value_list[0], value_list[i]);
+    }
+    return value_list[0];
+  }
+
+  template <int output_vec_size, typename offset_calc_t>
+  C10_DEVICE Array<arg_t, output_vec_size> thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const {
+    uint32_t idx = config.input_idx();
+    const uint32_t end = config.num_inputs;
+    const uint32_t stride = config.step_input;
+    const int vt0=${vt0};
+
+    using arg_vec_t = Array<arg_t, output_vec_size>;
+    using load_t = aligned_vector<scalar_t, output_vec_size>;
+    const load_t* data = reinterpret_cast<const load_t*>(data_);
+
+    // Multiple accumulators to remove dependency between unrolled loops.
+    arg_vec_t value_list[vt0];
+
+    #pragma unroll
+    for (int i = 0; i < vt0; i++) {
+      #pragma unroll
+      for (int j = 0; j < output_vec_size; j++) {
+        value_list[i][j] = ident;
+      }
+    }
+
+    load_t values[vt0];
+
+    while (idx + (vt0 - 1) * stride < end) {
+      #pragma unroll
+      for (uint32_t i = 0; i < vt0; i++) {
+        values[i] = data[calc(idx + i * stride) / output_vec_size];
+      }
+      #pragma unroll
+      for (uint32_t i = 0; i < vt0; i++) {
+        #pragma unroll
+        for (uint32_t j = 0; j < output_vec_size; j++) {
+          value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride);
+        }
+      }
+      idx += stride * vt0;
+    }
+
+    // tail
+    int idx_ = idx;
+    #pragma unroll
+    for (uint32_t i = 0; i < vt0; i++) {
+      if (idx >= end) {
+        break;
+      }
+      values[i] = data[calc(idx) / output_vec_size];
+      idx += stride;
+    }
+    idx = idx_;
+    #pragma unroll
+    for (uint32_t i = 0; i < vt0; i++) {
+      if (idx >= end) {
+        break;
+      }
+      #pragma unroll
+      for (uint32_t j = 0; j < output_vec_size; j++) {
+        value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx);
+      }
+      idx += stride;
+    }
+
+    // combine accumulators
+    #pragma unroll
+    for (int i = 1; i < vt0; i++) {
+      #pragma unroll
+      for (uint32_t j = 0; j < output_vec_size; j++) {
+        value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]);
+      }
+    }
+    return value_list[0];
+  }
+  template <int output_vec_size>
+  C10_DEVICE Array<arg_t, output_vec_size> block_x_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
+    using args_vec_t = Array<arg_t, output_vec_size>;
+    int dim_x = blockDim.x;
+    args_vec_t* shared = (args_vec_t*)shared_memory;
+    if (dim_x > warpSize) {
+      int address_base = threadIdx.x + threadIdx.y*blockDim.x;
+      shared[address_base] = value;
+      for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) {
+        __syncthreads();
+        if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) {
+          args_vec_t other = shared[address_base + offset];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = reducer::combine(value[i], other[i]);
+          }
+          shared[address_base] = value;
+        }
+      }
+      dim_x = warpSize;
+    }
+
+    __syncthreads();
+
+    for (int offset = 1; offset < dim_x; offset <<= 1) {
+      #pragma unroll
+      for (int i = 0; i < output_vec_size; i++) {
+        arg_t other = reducer::warp_shfl_down(value[i], offset);
+        value[i] = reducer::combine(value[i], other);
+      }
+    }
+    return value;
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE Array<arg_t, output_vec_size> block_y_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
+    using args_vec_t = Array<arg_t, output_vec_size>;
+    args_vec_t* shared = (args_vec_t*)shared_memory;
+    shared[config.shared_memory_offset(0)] = value;
+    for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
+      __syncthreads();
+      if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
+        args_vec_t other = shared[config.shared_memory_offset(offset)];
+        #pragma unroll
+        for (int i = 0; i < output_vec_size; i++) {
+          value[i] = reducer::combine(value[i], other[i]);
+        }
+        shared[config.shared_memory_offset(0)] = value;
+      }
+    }
+    return value;
+  }
+  )ESCAPE";
+
+  const std::string reduction_template_1 = R"ESCAPE(
+
+  C10_DEVICE bool mark_block_finished() const {
+    __shared__ bool is_last_block_done_shared;
+
+    __syncthreads();
+    if (threadIdx.x == 0 && threadIdx.y == 0) {
+      int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1);
+      is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1);
+    }
+
+    __syncthreads();
+
+    return is_last_block_done_shared;
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE Array<arg_t, output_vec_size> accumulate_in_output(
+    Array<out_scalar_t*, output_vec_size> out,
+    Array<arg_t, output_vec_size> value
+  ) const {
+    Array<arg_t, output_vec_size> ret;
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      ret[i] = reducer::combine(*(out[i]), value[i]);
+    }
+    return ret;
+  }
+
+
+  C10_DEVICE out_scalar_t get_accumulated_output(
+    out_scalar_t* out, arg_t value
+  ) const {
+    assert(!final_output);
+    return (out_scalar_t)value;
+  }
+
+  template<class T>
+  C10_DEVICE void set_results(const T x, const uint32_t base_offset) const {
+    assert(noutputs == 1);
+    auto res = (out_scalar_t*)((char*)dst[0] + base_offset);
+    *res = x;
+  }
+
+//TODO - multi-output reduction - we won't be able to use thrust::pair
+//just explicitly specify typed output reads/writes
+//Currently implemented for max of two outputs
+//   template<class T1, class T2>
+//   C10_DEVICE void set_results(const thrust::pair<T1, T2> x, const index_t base_offset) const {
+//     if (noutputs >= 1) {
+//       auto res0 = (T1*)((char*)dst[0] + base_offset);
+//       *res0 = x.first;
+//     }
+//     if (noutputs >= 2) {
+//       // base offset is computed assuming element size being sizeof(T1), so we need to make a
+//       // correction to obtain the correct base offset
+//       auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2));
+//       *res1 = x.second;
+//     }
+//   }
+
+  template <int output_vec_size>
+  C10_DEVICE void set_results_to_output(Array<arg_t, output_vec_size> value, Array<uint32_t, output_vec_size> base_offset) const {
+    assert(final_output);
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      set_results(reducer::project(value[i]), base_offset[i]);
+    }
+  }
+
+  template <int output_vec_size>
+  C10_DEVICE Array<arg_t, output_vec_size> global_reduce(Array<arg_t, output_vec_size> value, Array<arg_t, output_vec_size> *acc, char* shared_memory) const {
+    using arg_vec_t = Array<arg_t, output_vec_size>;
+    using out_ptr_vec_t = Array<out_scalar_t*, output_vec_size>;
+    using offset_vec_t = Array<uint32_t, output_vec_size>;
+
+    arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf;
+    uint32_t output_idx = config.output_idx<output_vec_size>();
+    offset_vec_t base_offsets;
+    out_ptr_vec_t out;
+
+    #pragma unroll
+    for (int i = 0; i < output_vec_size; i++) {
+      base_offsets[i] = output_calc.get(output_idx + i)[0];
+      out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
+    }
+
+    bool should_store = config.should_store(output_idx);
+    if (should_store) {
+      uint32_t offset = config.staging_memory_offset(blockIdx.y);
+      reduce_buffer[offset] = value;
+    }
+
+    __threadfence(); // make sure writes are globally visible
+    __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
+    bool is_last_block_done = mark_block_finished();
+
+    if (is_last_block_done) {
+      __threadfence(); //complete acquire pattern
+      value = ident;
+      if (config.should_block_x_reduce()) {
+        uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x;
+        uint32_t step = blockDim.x * blockDim.y;
+        for (; input_offset < config.ctas_per_output; input_offset += step) {
+          uint32_t idx = config.staging_memory_offset(input_offset);
+          arg_vec_t next = reduce_buffer[idx];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = reducer::combine(value[i], next[i]);
+          }
+        }
+      } else {
+        uint32_t input_offset = threadIdx.y;
+        uint32_t step = blockDim.y;
+        for (; input_offset < config.ctas_per_output; input_offset += step) {
+          uint32_t idx = config.staging_memory_offset(input_offset);
+          arg_vec_t next = reduce_buffer[idx];
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = reducer::combine(value[i], next[i]);
+          }
+        }
+      }
+      value = block_y_reduce(value, shared_memory);
+      if (config.should_block_x_reduce()) {
+        value = block_x_reduce<output_vec_size>(value, shared_memory);
+      }
+      if (should_store) {
+        if (accumulate) {
+          #pragma unroll
+          for (int i = 0; i < output_vec_size; i++) {
+            value[i] = reducer::translate_idx(value[i], base_idx);
+          }
+        }
+
+        if (acc == nullptr) {
+          if (accumulate) {
+            value = accumulate_in_output<output_vec_size>(out, value);
+          }
+          if (final_output) {
+            set_results_to_output<output_vec_size>(value, base_offsets);
+          } else {
+            #pragma unroll
+            for (int i = 0; i < output_vec_size; i++) {
+              *(out[i]) = get_accumulated_output(out[i], value[i]);
+            }
+          }
+        } else {
+          if (accumulate) {
+            #pragma unroll
+            for (int i = 0; i < output_vec_size; i++) {
+              value[i] = reducer::combine((*acc)[i], value[i]);
+            }
+          }
+          if (final_output) {
+            set_results_to_output<output_vec_size>(value, base_offsets);
+          } else {
+            *acc = value;
+          }
+        }
+      }
+    }
+
+    return value;
+  }
+};
+
+extern "C"
+__launch_bounds__(${max_threads_lb}, 4)
+__global__ void reduction_${name}_kernel(ReduceJitOp r){
+  r.run();
+}
+)ESCAPE";
+
+const std::string reduction_template = reduction_template_0 + reduction_template_1;
+
+
+const std::string &get_reduction_template() {
+  return reduction_template;
+}
+
+}}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/im2col_shape_check.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/im2col_shape_check.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a6fa47ba10f108d4437ff46bc9f4610e9f37dc4
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/im2col_shape_check.h
@@ -0,0 +1,232 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/div_rtn.h>
+
+namespace at::native {
+
+inline void col2im_shape_check(
+    const Tensor& input,
+    const Tensor& grad_output,
+    int64_t output_height,
+    int64_t output_width,
+    int64_t kernel_height,
+    int64_t kernel_width,
+    int64_t dilation_height,
+    int64_t dilation_width,
+    int64_t pad_height,
+    int64_t pad_width,
+    int64_t stride_height,
+    int64_t stride_width) {
+  TORCH_CHECK(
+      kernel_width > 0 && kernel_height > 0,
+      "kernel size should be greater than zero, but got kernel_height: ",
+      kernel_height,
+      " kernel_width: ",
+      kernel_width);
+  TORCH_CHECK(
+      stride_width > 0 && stride_height > 0,
+      "stride should be greater than zero, but got stride_height: ",
+      stride_height,
+      " stride_width: ",
+      stride_width);
+  TORCH_CHECK(
+      dilation_width > 0 && dilation_height > 0,
+      "dilation should be greater than zero, but got dilation_height: ",
+      dilation_height,
+      " dilation_width: ",
+      dilation_width);
+  TORCH_CHECK(
+      pad_width >= 0 && pad_height >= 0,
+      "padding should be non-negative, but got pad_height: ",
+      pad_height,
+      " pad_width: ",
+      pad_width);
+
+
+  int64_t ndim = input.ndimension();
+  // allow dim=0 only the batch dimension.
+  TORCH_CHECK(
+      (ndim == 2 && input.size(0) != 0 && input.size(1) != 0) ||
+      (ndim == 3 && input.size(1) != 0 && input.size(2) != 0),
+      "Expected 2D or 3D (batch mode) tensor for input with possibly 0 batch size and non-zero dimensions for input, but got: ",
+      input.sizes());
+
+  int64_t batch_dim = (ndim == 3) ? 0 : -1;
+  int64_t n_input_plane = input.size(batch_dim + 1);
+
+  if (n_input_plane % (kernel_width * kernel_height) != 0) {
+    AT_ERROR(
+        "Expected size of input's dimension 1 to be divisible by the "
+        "product of kernel_size, but got input.size(1)=",
+        n_input_plane,
+        " and kernel_size=(",
+        kernel_height,
+        ", ",
+        kernel_width,
+        ").");
+  }
+
+  int64_t input_length = input.size(batch_dim + 2);
+  int64_t n_blocks_height =
+      div_rtn<int64_t>(
+          output_height + 2 * pad_height -
+              dilation_height * (kernel_height - 1) - 1,
+          stride_height) +
+      1;
+  int64_t n_blocks_width = div_rtn<int64_t>(
+                                   output_width + 2 * pad_width -
+                                       dilation_width * (kernel_width - 1) - 1,
+                                   stride_width) +
+      1;
+
+  if (input_length != (n_blocks_height * n_blocks_width)) {
+    AT_ERROR(
+        "Given output_size=(",
+        output_height,
+        ", ",
+        output_width,
+        "), kernel_size=(",
+        kernel_height,
+        ", ",
+        kernel_width,
+        "), dilation=(",
+        dilation_height,
+        ", ",
+        dilation_width,
+        "), padding=(",
+        pad_height,
+        ", ",
+        pad_width,
+        "), stride=(",
+        stride_height,
+        ", ",
+        stride_width,
+        "), expected size of input's dimension 2 to match the calculated number of ",
+        "sliding blocks ",
+        n_blocks_height,
+        " * ",
+        n_blocks_width,
+        " = ",
+        (n_blocks_height * n_blocks_width),
+        ", but got input.size(2)=",
+        input_length,
+        ".");
+  }
+
+  TORCH_CHECK(
+    n_blocks_height >= 1 && n_blocks_width >= 1,
+    "Given output_size=(", output_height, ", ", output_width, "), ",
+    "kernel_size=(", kernel_height, ", ", kernel_width, "), ",
+    "dilation=(", dilation_height, ", ", dilation_width, "), ",
+    "padding=(", pad_height, ", ", pad_width, "), ",
+    "stride=(", stride_height, ", ", stride_width, "), ",
+    "calculated shape of the array of sliding blocks as ",
+    "(", n_blocks_height, ", ", n_blocks_width, "), ",
+    "which is too small (non-positive)");
+
+  if (output_width < 1 || output_height < 1) {
+    AT_ERROR(
+        "Expected output spatial size to be positive, but got: output_size=(",
+        output_height,
+        ", ",
+        output_width,
+        ").");
+  }
+}
+
+inline void im2col_shape_check(
+    const Tensor& input,
+    const Tensor& grad_output,
+    int64_t kernel_height,
+    int64_t kernel_width,
+    int64_t dilation_height,
+    int64_t dilation_width,
+    int64_t pad_height,
+    int64_t pad_width,
+    int64_t stride_height,
+    int64_t stride_width) {
+  TORCH_CHECK(
+      kernel_width > 0 && kernel_height > 0,
+      "kernel size should be greater than zero, but got kernel_height: ",
+      kernel_height,
+      " kernel_width: ",
+      kernel_width);
+
+  TORCH_CHECK(
+      dilation_width > 0 && dilation_height > 0,
+      "dilation should be greater than zero, but got dilation_height: ",
+      dilation_height,
+      " dilation_width: ",
+      dilation_width);
+
+  TORCH_CHECK(
+      pad_width >= 0 && pad_height >= 0,
+      "padding should be non-negative, but got pad_height: ",
+      pad_height,
+      " pad_width: ",
+      pad_width);
+
+  TORCH_CHECK(
+      stride_width > 0 && stride_height > 0,
+      "stride should be greater than zero, but got stride_height: ",
+      stride_height,
+      " stride_width: ",
+      stride_width);
+
+  int64_t ndim = input.ndimension();
+
+  // allow dim=0 only the batch dimension.
+  bool valid_dims = input.size(1) != 0 && input.size(2) != 0;
+  TORCH_CHECK(
+      (ndim == 3 && input.size(0) && valid_dims) ||
+      (ndim == 4 && valid_dims && input.size(3) != 0),
+      "Expected 3D or 4D (batch mode) tensor with possibly 0 batch size and other non-zero dimensions for input, but got: ",
+      input.sizes());
+
+  int64_t dim_batch = 0;
+
+  if (ndim == 3) {
+    dim_batch = -1;
+  }
+
+  int64_t input_height = input.size(dim_batch + 2);
+  int64_t input_width = input.size(dim_batch + 3);
+  int64_t output_height = div_rtn<int64_t>(
+                              input_height + 2 * pad_height -
+                                  (dilation_height * (kernel_height - 1) + 1),
+                              stride_height) +
+      1;
+  int64_t output_width = div_rtn<int64_t>(
+                             input_width + 2 * pad_width -
+                                 (dilation_width * (kernel_width - 1) + 1),
+                             stride_width) +
+      1;
+
+  if (output_height < 1 || output_width < 1) {
+    AT_ERROR(
+        "Given input with spatial size (",
+        input_height,
+        ", ",
+        input_height,
+        "), kernel_size=(",
+        kernel_height,
+        ", ",
+        kernel_width,
+        "), dilation=(",
+        dilation_height,
+        ", ",
+        dilation_width,
+        "), padding=(",
+        pad_height,
+        ", ",
+        pad_width,
+        "), calculated shape of the array of sliding blocks as (",
+        output_height,
+        ", ",
+        output_width,
+        "), but its components must be at least one.");
+  }
+}
+
+} // namespace at::native