diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/EmptyTensor.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/EmptyTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..39b206cb3031d7fbdcc704c86453791adf81e15e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/EmptyTensor.h
@@ -0,0 +1,29 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+#include <ATen/core/TensorBase.h>
+
+namespace at::detail {
+// Exported factory declarations returning TensorBase; no definitions appear in this header.
+C10_EXPORT TensorBase empty_mps( // overload taking every option as a separate std::optional
+    IntArrayRef size,
+    std::optional<ScalarType> dtype_opt,
+    std::optional<Layout> layout_opt,
+    std::optional<Device> device_opt,
+    std::optional<bool> pin_memory_opt,
+    std::optional<c10::MemoryFormat> memory_format_opt);
+C10_EXPORT TensorBase empty_mps( // overload taking the options bundled in a TensorOptions
+    IntArrayRef size, const TensorOptions &options);
+// NOTE(review): presumably these create uninitialized tensors on the MPS device — confirm against the implementation.
+C10_EXPORT TensorBase empty_strided_mps( // overload with explicit strides, a required dtype and an optional device
+    IntArrayRef size,
+    IntArrayRef stride,
+    ScalarType dtype,
+    std::optional<Device> device_opt);
+// Overload with explicit strides and the options bundled in a TensorOptions.
+C10_EXPORT TensorBase empty_strided_mps(
+    IntArrayRef size,
+    IntArrayRef stride,
+    const TensorOptions &options);
+
+} // namespace at::detail
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/IndexKernels.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/IndexKernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..21b639da3e48744608061bdccb472223e89368f7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/IndexKernels.h
@@ -0,0 +1,535 @@
+#pragma once
+
+namespace at::mps {
+
+static const char * indexing_metal_shaders = R"INDEX_METAL(
+#include <metal_stdlib>
+#include <metal_atomic>
+
+using namespace metal;
+
+struct IndexAB { // element type of the indexAB buffer; the kernels below read indexAB[i] for i < num_indices
+    constant int64_t* indexArray; // pointer to signed 64-bit index values in the constant address space
+};
+
+template<typename T, typename OffsetsT>
+kernel void index_select( // each thread copies one T from inputData to outputData; the read address is displaced by the accumulated index offset
+    constant IndexAB  * indexAB           [[buffer(0)]], // num_indices entries, each holding a pointer to an int64 index array
+    constant void     * indexSizes        [[buffer(1)]], // reinterpreted below as int64_t[]; added to negative indices
+    constant void     * indexStrides      [[buffer(2)]], // reinterpreted below as int64_t[]; multiplied by each index
+    constant OffsetsT * offsets           [[buffer(3)]], // per-thread byte offsets: .x output, .y input, .z index arrays
+    constant void     * inputData         [[buffer(4)]],
+    device   void     * outputData        [[buffer(5)]],
+    constant uint32_t & num_indices       [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]) { // NOTE(review): no bounds check — presumably exactly one thread per element is dispatched; confirm
+    constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
+    constant int64_t * index_strides = (constant int64_t *)indexStrides;
+    int64_t offset = 0; // byte displacement contributed by all index arrays
+    for (uint32_t i = 0; i < num_indices; i++) {
+        constant int64_t* indexArray = indexAB[i].indexArray;
+        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; // .z is a byte offset, converted to an element position
+        if (index < 0) { // a negative index is shifted up by index_sizes[i]
+            index += index_sizes[i];
+        }
+        offset += index * index_strides[i]; // added to a char pointer below, so the stride acts as a byte stride
+     }
+    device T * out = (device T*)((device char*)outputData + offsets[thread_index].x);
+    constant T * in  = (constant T*)((constant char*)inputData  + offsets[thread_index].y + offset); // only the read side is displaced
+    *out = *in;
+}
+
+template<typename T, typename OffsetsT>
+void index_put_impl( // shared body of the index_put kernels: copies one T, displacing the write address by the accumulated index offset
+    constant IndexAB  * indexAB,       // num_indices entries, each holding a pointer to an int64 index array
+    constant int64_t  * index_sizes,   // added to negative indices
+    constant int64_t  * index_strides, // multiplied by each index
+    constant OffsetsT * offsets,       // per-element byte offsets: .x output, .y input, .z index arrays
+    constant void     * inputData,
+    device   void     * outputData,
+    constant uint32_t & num_indices,
+    uint thread_index) {               // selects which entry of offsets to process
+    int64_t offset = 0; // byte displacement contributed by all index arrays
+    for (uint32_t i = 0; i < num_indices; i++) {
+        constant int64_t* indexArray = indexAB[i].indexArray;
+        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; // .z is a byte offset, converted to an element position
+
+        if (index < 0) { // a negative index is shifted up by index_sizes[i]
+            index += index_sizes[i];
+        }
+        offset += index * index_strides[i]; // added to a char pointer below, so the stride acts as a byte stride
+    }
+    device T * out = (device T*)((device char*)outputData + offsets[thread_index].x + offset); // only the write side is displaced
+    constant T * in  = (constant T*)((constant char*)inputData  + offsets[thread_index].y);
+    *out = *in;
+}
+
+template<typename T, typename OffsetsT>
+kernel void index_put_serial( // runs index_put_impl for element positions 0 .. *numIters - 1 in order, inside a single invocation
+    constant IndexAB  * indexAB           [[buffer(0)]],
+    constant void     * indexSizes        [[buffer(1)]],
+    constant void     * indexStrides      [[buffer(2)]],
+    constant OffsetsT * offsets           [[buffer(3)]],
+    constant void     * inputData         [[buffer(4)]],
+    device   void     * outputData        [[buffer(5)]],
+    constant uint32_t & num_indices       [[buffer(6)]],
+    constant uint     * numIters          [[buffer(7)]], // number of element positions to process
+    uint thread_index [[thread_position_in_grid]]) {
+    // NOTE(review): thread_index is unused, so every dispatched thread runs the whole loop — presumably launched with one thread; confirm.
+    constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
+    constant int64_t * index_strides = (constant int64_t *)indexStrides;
+
+    for (uint iter_i = 0; iter_i < *numIters; iter_i++) {
+        index_put_impl<T>(indexAB, index_sizes, index_strides, offsets, inputData, outputData, num_indices, iter_i); // iter_i stands in for the thread index
+    }
+}
+
+template<typename T, typename OffsetsT>
+kernel void index_put( // one index_put_impl call per thread, using the grid position as the element position
+    constant IndexAB  * indexAB           [[buffer(0)]],
+    constant void     * indexSizes        [[buffer(1)]], // reinterpreted below as int64_t[]
+    constant void     * indexStrides      [[buffer(2)]], // reinterpreted below as int64_t[]
+    constant OffsetsT * offsets           [[buffer(3)]],
+    constant void     * inputData         [[buffer(4)]],
+    device   void     * outputData        [[buffer(5)]],
+    constant uint32_t & num_indices       [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]) {
+    // NOTE(review): no bounds check on thread_index — presumably exactly one thread per element is dispatched; confirm.
+    constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
+    constant int64_t * index_strides = (constant int64_t *)indexStrides;
+    index_put_impl<T>(indexAB, index_sizes, index_strides, offsets, inputData, outputData, num_indices, thread_index);
+}
+
+#define REGISTER_INDEX_OP(DTYPE_SIZE, IDX_SIZE, DTYPE, INDEX_OP_TYPE, IDX_DTYPE)   \
+template /* explicit instantiation of index_<INDEX_OP_TYPE><DTYPE, IDX_DTYPE> */  \
+[[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE "_" #IDX_SIZE)]]               \
+kernel void index_ ## INDEX_OP_TYPE<DTYPE, IDX_DTYPE>(                             \
+    constant IndexAB * indexAB           [[buffer(0)]],                            \
+    constant void    * indexSizes        [[buffer(1)]],                            \
+    constant void    * indexStrides      [[buffer(2)]],                            \
+    constant IDX_DTYPE   * offsets           [[buffer(3)]],                        \
+    constant void    * inputData         [[buffer(4)]],                            \
+    device   void    * outputData        [[buffer(5)]],                            \
+    constant uint32_t & num_indices      [[buffer(6)]],                            \
+    uint thread_index [[thread_position_in_grid]]); // host-visible name is built by string pasting, e.g. "index_select_8bit_idx32"
+// Expands REGISTER_INDEX_OP for 8/16/32/64-bit element types (char/short/int/long), each with uint3 and ulong3 offsets.
+#define REGISTER_INDEX_OP_ALL_DTYPES(INDEX_OP_TYPE)     \
+    REGISTER_INDEX_OP(8bit,  idx32, char,  INDEX_OP_TYPE, uint3);     \
+    REGISTER_INDEX_OP(8bit,  idx64, char,  INDEX_OP_TYPE, ulong3);    \
+    REGISTER_INDEX_OP(16bit, idx32, short, INDEX_OP_TYPE, uint3);     \
+    REGISTER_INDEX_OP(16bit, idx64, short, INDEX_OP_TYPE, ulong3);    \
+    REGISTER_INDEX_OP(32bit, idx32, int,   INDEX_OP_TYPE, uint3);     \
+    REGISTER_INDEX_OP(32bit, idx64, int,   INDEX_OP_TYPE, ulong3);    \
+    REGISTER_INDEX_OP(64bit, idx32, long,  INDEX_OP_TYPE, uint3);     \
+    REGISTER_INDEX_OP(64bit, idx64, long,  INDEX_OP_TYPE, ulong3);
+// Instantiate all eight variants of the index_select and index_put kernels.
+REGISTER_INDEX_OP_ALL_DTYPES(select);
+REGISTER_INDEX_OP_ALL_DTYPES(put);
+
+#define REGISTER_SINGLE_THREADED_INDEX_OP(DTYPE_SIZE, IDX_SIZE, DTYPE, INDEX_OP_TYPE, IDX_DTYPE)   \
+template /* same as REGISTER_INDEX_OP, plus the numIters argument at buffer(7) */                 \
+[[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE "_" #IDX_SIZE)]]                               \
+kernel void index_ ## INDEX_OP_TYPE<DTYPE, IDX_DTYPE>(                                             \
+    constant IndexAB   * indexAB           [[buffer(0)]],                                          \
+    constant void      * indexSizes        [[buffer(1)]],                                          \
+    constant void      * indexStrides      [[buffer(2)]],                                          \
+    constant IDX_DTYPE * offsets           [[buffer(3)]],                                          \
+    constant void      * inputData         [[buffer(4)]],                                          \
+    device   void      * outputData        [[buffer(5)]],                                          \
+    constant uint32_t  & num_indices       [[buffer(6)]],                                          \
+    constant uint      * numIters          [[buffer(7)]],                                          \
+    uint thread_index [[thread_position_in_grid]]); // host-visible name is built by string pasting, e.g. "index_put_serial_8bit_idx32"
+// Expands REGISTER_SINGLE_THREADED_INDEX_OP for 8/16/32/64-bit element types, each with uint3 and ulong3 offsets.
+#define REGISTER_SINGLE_THREADED_INDEX_OP_ALL_DTYPES(INDEX_OP_TYPE)                   \
+    REGISTER_SINGLE_THREADED_INDEX_OP(8bit,  idx32, char,  INDEX_OP_TYPE, uint3);     \
+    REGISTER_SINGLE_THREADED_INDEX_OP(8bit,  idx64, char,  INDEX_OP_TYPE, ulong3);    \
+    REGISTER_SINGLE_THREADED_INDEX_OP(16bit, idx32, short, INDEX_OP_TYPE, uint3);     \
+    REGISTER_SINGLE_THREADED_INDEX_OP(16bit, idx64, short, INDEX_OP_TYPE, ulong3);    \
+    REGISTER_SINGLE_THREADED_INDEX_OP(32bit, idx32, int,   INDEX_OP_TYPE, uint3);     \
+    REGISTER_SINGLE_THREADED_INDEX_OP(32bit, idx64, int,   INDEX_OP_TYPE, ulong3);    \
+    REGISTER_SINGLE_THREADED_INDEX_OP(64bit, idx32, long,  INDEX_OP_TYPE, uint3);     \
+    REGISTER_SINGLE_THREADED_INDEX_OP(64bit, idx64, long,  INDEX_OP_TYPE, ulong3);
+// Instantiate all eight variants of the index_put_serial kernel.
+REGISTER_SINGLE_THREADED_INDEX_OP_ALL_DTYPES(put_serial);
+
+template<typename StridesT, typename DataT>
+kernel void kernel_index_offsets(constant StridesT * strides         [[buffer(0)]], // one stride vector per dimension
+                                device DataT      * data_offsets    [[buffer(1)]], // output: one offset vector per thread
+                                constant uint     * iter_shape      [[buffer(2)]], // extent of each dimension, dimension 0 varying fastest
+                                constant uint     & num_dimensions  [[buffer(3)]],
+                                uint thread_index [[thread_position_in_grid]]) { // NOTE(review): no bounds check — presumably one thread per output entry; confirm
+    data_offsets[thread_index] = 0;
+    uint32_t idx = thread_index; // remaining linear position still to be decomposed
+    for (uint32_t dim = 0; dim < num_dimensions; dim++) {
+        uint32_t remainder = idx % iter_shape[dim]; // coordinate along this dimension
+        idx /= iter_shape[dim];
+        // Accumulate coordinate * stride for all vector components at once.
+        data_offsets[thread_index] += remainder * DataT(strides[dim]);
+    }
+}
+// Explicit instantiation producing 32-bit (uint3) offsets.
+template
+[[host_name("kernel_index_offsets_32")]]
+kernel void kernel_index_offsets<packed_uint3, uint3>(
+                constant packed_uint3 * strides         [[buffer(0)]],
+                device uint3          * data_offsets    [[buffer(1)]],
+                constant uint         * iter_shape      [[buffer(2)]],
+                constant uint         & num_dimensions  [[buffer(3)]],
+                uint thread_index [[thread_position_in_grid]]);
+// Explicit instantiation producing 64-bit (ulong3) offsets; the strides are still read as packed_uint3.
+template
+[[host_name("kernel_index_offsets_64")]]
+kernel void kernel_index_offsets<packed_uint3, ulong3>(
+                constant packed_uint3 * strides         [[buffer(0)]],
+                device ulong3          * data_offsets    [[buffer(1)]],
+                constant uint         * iter_shape      [[buffer(2)]],
+                constant uint         & num_dimensions  [[buffer(3)]],
+                uint thread_index [[thread_position_in_grid]]);
+
+template<typename T, typename E, typename OffsetsT>
+kernel void index_put_accumulate_native_dtypes( // atomically adds one input value of type E into the indexed output location viewed as atomic type T
+    constant IndexAB  * indexAB     [[buffer(0)]], // num_indices entries, each holding a pointer to an int64 index array
+    constant void     * indexSizes   [[buffer(1)]], // reinterpreted below as int64_t[]; added to negative indices
+    constant void     * indexStrides [[buffer(2)]], // reinterpreted below as int64_t[]; multiplied by each index
+    constant OffsetsT * offsets      [[buffer(3)]], // per-thread byte offsets: .x output, .y input, .z index arrays
+    constant void     * inputData    [[buffer(4)]],
+    device void       * outputData   [[buffer(5)]],
+    constant uint32_t & num_indices  [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]) { // NOTE(review): no bounds check — presumably one thread per element; confirm
+    constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
+    constant int64_t * index_strides = (constant int64_t *)indexStrides;
+    int64_t offset = 0; // byte displacement contributed by all index arrays
+    for (uint32_t i = 0; i < num_indices; i++) {
+        constant int64_t* indexArray = indexAB[i].indexArray;
+        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; // .z is a byte offset, converted to an element position
+        if (index < 0) { // a negative index is shifted up by index_sizes[i]
+            index += index_sizes[i];
+        }
+        offset += index * index_strides[i];
+    }
+    device T * out = (device T*)((device char*)outputData + offsets[thread_index].x + offset); // T is an atomic type (atomic_int in the instantiations of this file)
+    constant E * in  = (constant E*)((constant char*)inputData  + offsets[thread_index].y);
+    atomic_fetch_add_explicit(out, *in, memory_order_relaxed); // relaxed ordering: only the addition itself is atomic
+}
+
+template<typename T>
+__attribute__((__always_inline__)) void atomic_fetch_add_relaxed(device void * addr, T value) { // adds value to *addr with a compare-exchange retry loop on the raw 32-bit pattern
+    device atomic_uint* uintAddr = (device atomic_uint*)addr; // the storage is treated as an atomic uint, so T is expected to be 32 bits wide (float in this file)
+    uint expected = atomic_load_explicit(uintAddr, memory_order_relaxed);
+    T updated = as_type<T>(expected) + value; // reinterpret the bits as T, then add
+    while (!atomic_compare_exchange_weak_explicit(uintAddr, &expected, as_type<uint>(updated), memory_order_relaxed, memory_order_relaxed)) {
+        updated = as_type<T>(expected) + value; // exchange failed: recompute the sum from the refreshed expected value and retry
+    }
+}
+
+template<typename T, typename OffsetsT>
+kernel void atomic_index_put_accumulate( // accumulates one input value of type T into the indexed output location through atomic_fetch_add_relaxed
+    constant IndexAB  * indexAB           [[buffer(0)]], // num_indices entries, each holding a pointer to an int64 index array
+    constant void     * indexSizes        [[buffer(1)]], // reinterpreted below as int64_t[]; added to negative indices
+    constant void     * indexStrides      [[buffer(2)]], // reinterpreted below as int64_t[]; multiplied by each index
+    constant OffsetsT * offsets           [[buffer(3)]], // per-thread byte offsets: .x output, .y input, .z index arrays
+    constant void     * inputData         [[buffer(4)]],
+    device   void     * outputData        [[buffer(5)]],
+    constant uint32_t & num_indices       [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]) { // NOTE(review): no bounds check — presumably one thread per element; confirm
+    constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
+    constant int64_t * index_strides = (constant int64_t *)indexStrides;
+    int64_t offset = 0; // byte displacement contributed by all index arrays
+    for (uint32_t i = 0; i < num_indices; i++) {
+        constant int64_t* indexArray = indexAB[i].indexArray;
+        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; // .z is a byte offset, converted to an element position
+        if (index < 0) { // a negative index is shifted up by index_sizes[i]
+            index += index_sizes[i];
+        }
+        offset += index * index_strides[i];
+    }
+    device void * out = (device void*)((device char*)outputData + offsets[thread_index].x + offset); // kept untyped; the helper reinterprets it as atomic_uint
+    constant T  * in  = (constant T*)((constant char*)inputData + offsets[thread_index].y);
+    atomic_fetch_add_relaxed<T>(out, *in);
+}
+
+template // float accumulation with 32-bit offsets, via the compare-exchange based kernel
+[[host_name("index_put_accumulate_32bit_float_idx32")]]
+kernel void atomic_index_put_accumulate<float, uint3>(
+    constant IndexAB  * indexAB     [[buffer(0)]],
+    constant void     * indexSizes   [[buffer(1)]],
+    constant void     * indexStrides [[buffer(2)]],
+    constant uint3    * offsets      [[buffer(3)]],
+    constant void     * inputData    [[buffer(4)]],
+    device   void     * outputData   [[buffer(5)]],
+    constant uint32_t & num_indices  [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]);
+
+template // float accumulation with 64-bit offsets, via the compare-exchange based kernel
+[[host_name("index_put_accumulate_32bit_float_idx64")]]
+kernel void atomic_index_put_accumulate<float, ulong3>(
+    constant IndexAB  * indexAB     [[buffer(0)]],
+    constant void     * indexSizes   [[buffer(1)]],
+    constant void     * indexStrides [[buffer(2)]],
+    constant ulong3   * offsets      [[buffer(3)]],
+    constant void     * inputData    [[buffer(4)]],
+    device   void     * outputData   [[buffer(5)]],
+    constant uint32_t & num_indices  [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]);
+
+template // int accumulation with 32-bit offsets, via atomic_fetch_add_explicit on atomic_int
+[[host_name("index_put_accumulate_32bit_int_idx32")]]
+kernel void index_put_accumulate_native_dtypes<atomic_int, int, uint3>(
+    constant IndexAB  * indexAB     [[buffer(0)]],
+    constant void     * indexSizes   [[buffer(1)]],
+    constant void     * indexStrides [[buffer(2)]],
+    constant uint3    * offsets      [[buffer(3)]],
+    constant void     * inputData    [[buffer(4)]],
+    device   void     * outputData   [[buffer(5)]],
+    constant uint32_t & num_indices [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]);
+
+template // int accumulation with 64-bit offsets, via atomic_fetch_add_explicit on atomic_int
+[[host_name("index_put_accumulate_32bit_int_idx64")]]
+kernel void index_put_accumulate_native_dtypes<atomic_int, int, ulong3>(
+    constant IndexAB  * indexAB     [[buffer(0)]],
+    constant void     * indexSizes   [[buffer(1)]],
+    constant void     * indexStrides [[buffer(2)]],
+    constant ulong3   * offsets      [[buffer(3)]],
+    constant void     * inputData    [[buffer(4)]],
+    device   void     * outputData   [[buffer(5)]],
+    constant uint32_t & num_indices [[buffer(6)]],
+    uint thread_index [[thread_position_in_grid]]);
+)INDEX_METAL";
+
+static const char *SCATTER_OPS_TEMPLATE = R"METAL_SCATTER(
+struct __attribute__ ((packed)) packed_uint5{{ // five-component counterpart of packed_uint4; doubled braces are presumably escapes for a fmt-style formatter — confirm against the host code
+  uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u; // x is treated as the outermost dimension and u as the innermost by scatter_kernel_5
+}};
+
+template<typename Y, typename X>
+Y cast(const X x); // primary template is only declared; the single specialization below supplies the conversion
+
+template<>
+{1} cast<{1}, {0}>(const {0} x) {{ // converts a value of the placeholder-0 type into the placeholder-1 type
+ return {2}; // placeholder 2 is the conversion expression — presumably written in terms of x; confirm against the host code
+}}
+
+kernel void scatter_kernel_5(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]], // read densely at src[linear_index]
+                             device void * dst_             [[buffer(1)]], // written at the strided position
+                             constant packed_uint5 & size   [[buffer(2)]], // extents of the five dimensions, x outermost and u innermost
+                             constant packed_uint5 & stride [[buffer(3)]], // per-dimension strides, counted in elements of the dst type
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return; // threads past the element count do nothing
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+    // Split linear_index into five coordinates, with u varying fastest.
+    packed_uint5 local_index;
+    local_index.x = linear_index / (size.u * size.w * size.z * size.y) % size.x;
+    local_index.y = linear_index / (size.u * size.w * size.z) % size.y;
+    local_index.z = linear_index / (size.u * size.w) % size.z;
+    local_index.w = linear_index / size.u % size.w;
+    local_index.u = linear_index % size.u;
+    // Multiply component by component; this struct has no vector operators.
+    packed_uint5 strided_index;
+    strided_index.x = local_index.x * stride.x;
+    strided_index.y = local_index.y * stride.y;
+    strided_index.z = local_index.z * stride.z;
+    strided_index.w = local_index.w * stride.w;
+    strided_index.u = local_index.u * stride.u;
+    // Dense read, strided write, all in 32-bit unsigned arithmetic.
+    dst[strided_index.x + strided_index.y + strided_index.z + strided_index.w + strided_index.u] = cast<{1}>(src[linear_index]);
+}}
+
+kernel void scatter_kernel_4(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]], // read densely at src[linear_index]
+                             device void * dst_             [[buffer(1)]], // written at the strided position
+                             constant packed_uint4 & size   [[buffer(2)]], // extents, size[0] outermost and size[3] innermost
+                             constant packed_uint4 & stride [[buffer(3)]], // per-dimension strides, counted in elements of the dst type
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return; // threads past the element count do nothing
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+    // Split linear_index into four coordinates, with the last dimension varying fastest.
+    packed_uint4 local_index;
+    local_index.x = linear_index / (size[3] * size[2] * size[1]) % size[0];
+    local_index.y = linear_index / (size[3] * size[2]) % size[1];
+    local_index.z = linear_index / size[3] % size[2];
+    local_index.w = linear_index % size[3];
+    // Component-wise product of coordinates and strides.
+    const packed_uint4 strided_index = local_index * stride;
+    dst[strided_index.x + strided_index.y + strided_index.z + strided_index.w] = cast<{1}>(src[linear_index]);
+}}
+
+kernel void scatter_kernel_3(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]], // read densely at src[linear_index]
+                             device void * dst_             [[buffer(1)]], // written at the strided position
+                             constant packed_uint3 & size   [[buffer(2)]], // extents, size[0] outermost and size[2] innermost
+                             constant packed_uint3 & stride [[buffer(3)]], // per-dimension strides, counted in elements of the dst type
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return; // threads past the element count do nothing
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+    // Split linear_index into three coordinates, with the last dimension varying fastest.
+    packed_uint3 local_index;
+    local_index.x = linear_index / (size[2] * size[1]) % size[0];
+    local_index.y = linear_index / size[2] % size[1];
+    local_index.z = linear_index % size[2];
+    // Component-wise product of coordinates and strides.
+    const packed_uint3 strided_index = local_index * stride;
+    dst[strided_index.x + strided_index.y + strided_index.z] = cast<{1}>(src[linear_index]);
+}}
+
+kernel void scatter_kernel_2(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]], // read densely at src[linear_index]
+                             device void * dst_             [[buffer(1)]], // written at the strided position
+                             constant packed_uint2 & size   [[buffer(2)]], // extents, size[0] outer and size[1] inner
+                             constant packed_uint2 & stride [[buffer(3)]], // per-dimension strides, counted in elements of the dst type
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return; // threads past the element count do nothing
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+    // Split linear_index into two coordinates, with the last dimension varying fastest.
+    packed_uint2 local_index;
+    local_index.x = linear_index / size[1] % size[0];
+    local_index.y = linear_index % size[1];
+    // Component-wise product of coordinates and strides.
+    const packed_uint2 strided_index = local_index * stride;
+    dst[strided_index.x + strided_index.y] = cast<{1}>(src[linear_index]);
+}}
+
+kernel void scatter_kernel_1(uint linear_index              [[thread_position_in_grid]],
+                             constant void * src_           [[buffer(0)]], // read densely at src[linear_index]
+                             device void * dst_             [[buffer(1)]], // written at the strided position
+                             constant uint & size           [[buffer(2)]], // extent of the single dimension; unsigned like the packed_uint sizes of the higher-rank kernels
+                             constant uint & stride         [[buffer(3)]], // stride counted in elements of the dst type
+                             constant uint32_t & numel      [[buffer(4)]]) {{
+    if (linear_index >= numel) return; // threads past the element count do nothing
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+    // Unsigned 32-bit arithmetic, so an index at or above 2^31 cannot turn negative.
+    const uint local_index = linear_index % size;
+    const uint strided_index = local_index * stride;
+    dst[strided_index] = cast<{1}>(src[linear_index]);
+}}
+)METAL_SCATTER";
+
+static const char *GATHER_OPS_TEMPLATE = R"METAL_GATHER(
+struct __attribute__ ((packed)) packed_uint5{{ // five-component counterpart of packed_uint4; doubled braces are presumably escapes for a fmt-style formatter — confirm against the host code
+  uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u; // x is treated as the outermost dimension and u as the innermost by gather_kernel_5
+}};
+
+template<typename Y, typename X>
+Y cast(const X x); // primary template is only declared; the single specialization below supplies the conversion
+
+template<>
+{1} cast<{1}, {0}>(const {0} x) {{ // converts a value of the placeholder-0 type into the placeholder-1 type
+ return {2}; // placeholder 2 is the conversion expression — presumably written in terms of x; confirm against the host code
+}}
+
+kernel void gather_kernel_5(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]], // read at the strided position
+                            device void * dst_              [[buffer(1)]], // written densely at dst[linear_index]
+                            constant packed_uint5 & size    [[buffer(2)]], // extents of the five dimensions, x outermost and u innermost
+                            constant packed_uint5 & stride  [[buffer(3)]], // per-dimension strides, counted in elements of the src type
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return; // threads past the element count do nothing
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+
+    // Split linear_index into five coordinates, with u varying fastest.
+    packed_uint5 local_index;
+    local_index.x = linear_index / (size.u * size.w * size.z * size.y) % size.x;
+    local_index.y = linear_index / (size.u * size.w * size.z) % size.y;
+    local_index.z = linear_index / (size.u * size.w) % size.z;
+    local_index.w = linear_index / size.u % size.w;
+    local_index.u = linear_index % size.u;
+    // Multiply component by component; this struct has no vector operators.
+    packed_uint5 strided_index;
+    strided_index.x = local_index.x * stride.x;
+    strided_index.y = local_index.y * stride.y;
+    strided_index.z = local_index.z * stride.z;
+    strided_index.w = local_index.w * stride.w;
+    strided_index.u = local_index.u * stride.u;
+    // Strided read, dense write, all in 32-bit unsigned arithmetic.
+    dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z + strided_index.w + strided_index.u]);
+}}
+
+kernel void gather_kernel_4(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]], // read at the strided position
+                            device void * dst_              [[buffer(1)]], // written densely at dst[linear_index]
+                            constant packed_uint4 & size    [[buffer(2)]], // extents, size[0] outermost and size[3] innermost
+                            constant packed_uint4 & stride  [[buffer(3)]], // per-dimension strides, counted in elements of the src type
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return; // threads past the element count do nothing
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+    // Split linear_index into four coordinates, with the last dimension varying fastest.
+    packed_uint4 local_index;
+    local_index.x = linear_index / (size[3] * size[2] * size[1]) % size[0];
+    local_index.y = linear_index / (size[3] * size[2]) % size[1];
+    local_index.z = linear_index / size[3] % size[2];
+    local_index.w = linear_index % size[3];
+    // Component-wise product of coordinates and strides.
+    const packed_uint4 strided_index = local_index * stride;
+    dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z + strided_index.w]);
+}}
+
+kernel void gather_kernel_3(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]], // read at the strided position
+                            device void * dst_              [[buffer(1)]], // written densely at dst[linear_index]
+                            constant packed_uint3 & size    [[buffer(2)]], // extents, size[0] outermost and size[2] innermost
+                            constant packed_uint3 & stride  [[buffer(3)]], // per-dimension strides, counted in elements of the src type
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return; // threads past the element count do nothing
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+    // Split linear_index into three coordinates, with the last dimension varying fastest.
+    packed_uint3 local_index;
+    local_index.x = linear_index / (size[2] * size[1]) % size[0];
+    local_index.y = linear_index / size[2] % size[1];
+    local_index.z = linear_index % size[2];
+    // Component-wise product of coordinates and strides.
+    const packed_uint3 strided_index = local_index * stride;
+    dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z]);
+}}
+
+kernel void gather_kernel_2(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]], // read at the strided position
+                            device void * dst_              [[buffer(1)]], // written densely at dst[linear_index]
+                            constant packed_uint2 & size    [[buffer(2)]], // extents, size[0] outer and size[1] inner
+                            constant packed_uint2 & stride  [[buffer(3)]], // per-dimension strides, counted in elements of the src type
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return; // threads past the element count do nothing
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+    // Split linear_index into two coordinates, with the last dimension varying fastest.
+    packed_uint2 local_index;
+    local_index.x = linear_index / size[1] % size[0];
+    local_index.y = linear_index % size[1];
+    // Component-wise product of coordinates and strides.
+    const packed_uint2 strided_index = local_index * stride;
+    dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y]);
+}}
+
+kernel void gather_kernel_1(uint linear_index               [[thread_position_in_grid]],
+                            constant void * src_            [[buffer(0)]], // read at the strided position
+                            device void * dst_              [[buffer(1)]], // written densely at dst[linear_index]
+                            constant uint & size            [[buffer(2)]], // extent of the single dimension; unsigned like the packed_uint sizes of the higher-rank kernels
+                            constant uint & stride          [[buffer(3)]], // stride counted in elements of the src type
+                            constant uint32_t & numel       [[buffer(4)]]) {{
+    if (linear_index >= numel) return; // threads past the element count do nothing
+
+    constant {0} * src = (constant {0} *)src_;
+    device {1} * dst = (device {1} *)dst_;
+    // Unsigned 32-bit arithmetic, so an index at or above 2^31 cannot turn negative.
+    const uint local_index = linear_index % size;
+    const uint strided_index = local_index * stride;
+    dst[linear_index] = cast<{1}>(src[strided_index]);
+}}
+)METAL_GATHER";
+} // namespace at::mps
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dc8c434f85b76c47ec5b8d7355a64ad3e3254ed
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h
@@ -0,0 +1,403 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include <ATen/mps/MPSAllocatorInterface.h>
+#include <ATen/mps/MPSEvent.h>
+#include <ATen/mps/MPSStream.h>
+
+#include <cstdio>
+#include <mutex>
+#include <set>
+#include <unordered_set>
+#include <mach/vm_page_size.h>
+#include <c10/util/flat_hash_map.h>
+
+// this implementation is based on CUDACachingAllocator.
+// It utilizes Metal Heaps to improve the performance with buffer allocation.
+// Do not include this header. Use MPSAllocatorInterface.h instead.
+// TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
+namespace at::mps::HeapAllocator {
+
+static const size_t kMaxSmallAlloc = MB(1);    // largest "small" allocation is 1 MiB (such requests get a kSmallHeap-sized heap)
+static const size_t kMinLargeAlloc = MB(10);   // requests above kMaxSmallAlloc and below 10 MiB get a kLargeHeap-sized heap
+static const size_t kRoundLarge    = MB(2);    // heaps holding one unsplit large allocation are rounded up to a multiple of 2 MiB
+static const size_t kSmallHeap     = MB(8);    // "small" allocations are packed in 8 MiB heaps
+static const size_t kLargeHeap     = MB(32);   // "large" allocations may be packed in 32 MiB heaps
+static const size_t kXLargeHeapD   = MB(128);  // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
+static const size_t kXLargeHeapU   = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
+static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation, i.e. the size of one int64_t
+
+// bitmask of usage flags; a buffer pool is customized by OR-ing these together
+enum UsageFlags : uint32_t {
+  PRIVATE = 0x00, // no bit set
+  SMALL   = 0x01, // small heaps are sized kSmallHeap, large ones kLargeHeap
+  SHARED  = 0x02, // shared pools are allocated on unified-memory devices; otherwise private between host/device
+  MANAGED = 0x04, // managed storage mode
+  HAZARD  = 0x08, // turns on Automatic Hazard Tracking for resources allocated on the pool
+  SCALAR  = 0x10, // pool used to import CPU scalar values to GPU for use in MPS Stream
+};
+// bitmask selecting which allocator activity is printed for debugging
+enum DebugVerbosity : uint32_t {
+  SILENT      = 0x00,
+  PROFILING   = 0x01, // generic profiling data for total system memory usage
+  ALLOCATIONS = 0x02, // buffer allocations
+  RECYCLES    = 0x04, // buffer recycling
+  RELEASES    = 0x08, // buffer releases
+  LARGE_ONLY  = 0x10, // restrict logging to large buffer pool transactions
+};
+
+struct HeapBlock;
+// Bookkeeping record the allocator keeps for a single MTLBuffer.
+struct BufferBlock {
+  id<MTLBuffer> buffer;
+  void* cpu_ptr = nullptr; // pointer to the CPU mapping of a Shared MTLBuffer
+  size_t size; // aligned size
+  size_t requested_size; // size that was asked for (prior to alignment)
+  // shape of the buffer; lets cached graphs retrieve the base of a view
+  std::vector<int64_t> shape;
+  bool in_use = false;
+  HeapBlock* heap; // nullptr unless a heap is passed to the constructor
+  id_t buf_id; // 0 when constructed without an MTLBuffer
+  // counter used to nominate least recently used buffers for garbage collection
+  uint32_t gc_count = 0;
+  uint32_t use_count = 0;
+  // running counter from which unique buffer-block ids are drawn
+  static uint64_t buffer_counter;
+  // Metal event used to sync GPU/CPU operations on shared-storage buffers
+  MPSEventPtr event;
+
+  BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr, HeapBlock* Heap = nullptr)
+      : buffer(Buffer), size(Size), requested_size(RequestedSize), heap(Heap),
+        buf_id(Buffer ? ++buffer_counter : 0) {}
+  // strict ordering: by size first, then by the buffer handle's address value
+  static bool Comparator(const BufferBlock* a, const BufferBlock* b) {
+    if (a->size != b->size) return a->size < b->size;
+    return (uintptr_t)a->buffer < (uintptr_t)b->buffer;
+  }
+  // rounds Size up to the next multiple of Alignment (asserted to have at most one bit set)
+  static size_t alignUp(size_t Size, size_t Alignment) {
+    const size_t mask = Alignment - 1;
+    assert((mask & Alignment) == 0);
+    return (Size + mask) & ~mask;
+  }
+  uint32_t retainCount() const { return [buffer retainCount]; }
+};
+using BufferComparison = bool (*)(const BufferBlock*, const BufferBlock*);
+
+struct BufferPool;
+struct AllocParams {
+  BufferBlock search_key; // constructed from the aligned allocation size only
+  BufferPool* pool;
+  BufferBlock* buffer_block = nullptr;
+  size_t requested_size;
+  // set when the low watermark limit is exceeded; pressure-relief
+  // strategies are then applied ahead of the allocation.
+  bool has_memory_pressure = false;
+  // set when the allocation targets a unified memory device
+  bool has_unified_memory = true;
+
+  AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool)
+      : search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) {}
+  size_t size() const { return search_key.size; } // aligned size stored in the search key
+};
+// One MTLHeap plus the bookkeeping (sizes, buffer count, owning pool) the allocator keeps for it.
+struct HeapBlock {
+  id<MTLHeap> heap;
+  struct { size_t total, available; } size; // both start at the constructor's Size; available is refreshed by updateAvailableSize()
+  BufferPool* pool;
+  unsigned int n_buffers = 0; // incremented in newMTLBuffer(), decremented in releaseMTLBuffer()
+  id_t heap_id;
+  // indicates if we split this heap to sub-allocate 'several' buffers (otherwise single buffer)
+  bool is_split;
+  // counter to assign unique ids to heap blocks
+  static uint64_t heap_counter;
+
+  HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool *Pool = nullptr) :
+            heap(Heap), size({.total = Size, .available = Size}), pool(Pool),
+            heap_id(Heap ? ++heap_counter : 0), is_split(true) { } // heap_id stays 0 when no MTLHeap is attached
+
+  static MTLResourceOptions getOptions(uint32_t usage) { // translates UsageFlags bits into MTLResourceOptions
+    // TODO: check the caching performance of write-combined mode
+    MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache;
+
+    if (usage & UsageFlags::MANAGED) // tested first, so MANAGED wins when SHARED is also set
+      options |= MTLResourceStorageModeManaged;
+    else if (usage & UsageFlags::SHARED)
+      options |= MTLResourceStorageModeShared;
+    else
+      options |= MTLResourceStorageModePrivate;
+
+    options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
+
+    return options;
+  }
+
+  static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) { // returns nullptr if the descriptor or heap cannot be created
+    HeapBlock *heapBlock = nullptr;
+    bool is_split = true;
+    const size_t size = params.size();
+    MTLHeapDescriptor *d = [MTLHeapDescriptor new];
+    if (d) {
+      const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
+      if (size <= kMaxSmallAlloc) { // pick the heap size class from the requested (aligned) size
+        d.size = kSmallHeap;
+      } else if (size < kMinLargeAlloc) {
+        d.size = kLargeHeap;
+      } else if (size < kXLargeHeap / 2 && !params.has_memory_pressure) {
+        d.size = kXLargeHeap;
+      } else { // dedicated heap: sized to the request rounded up to a multiple of kRoundLarge
+        d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
+        is_split = false;
+      }
+      d.storageMode = (usage & UsageFlags::SHARED) ? MTLStorageModeShared : MTLStorageModePrivate;
+      d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
+      // this automatically handles Metal buffer access synchronizations at the
+      // cost of slightly lower performance.
+      d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
+      d.resourceOptions = getOptions(usage);
+      d.type = MTLHeapTypeAutomatic;
+      id<MTLHeap> heap = [device newHeapWithDescriptor: d];
+      if (heap) {
+        [heap setPurgeableState:MTLPurgeableStateNonVolatile];
+        const size_t heap_size = heapAvailableSize(heap); // block size is what the heap reports as available, not d.size
+        heapBlock = new HeapBlock(heap_size, heap, params.pool);
+        if (heapBlock) {
+          heapBlock->is_split = is_split;
+        }
+      }
+      [d release]; // the descriptor is released on every path once the heap attempt is done
+    }
+    return heapBlock;
+  }
+  static bool Comparator(const HeapBlock* a, const HeapBlock* b) { // orders by available bytes, ties broken by heap handle address
+    return (a->size.available != b->size.available) ? a->size.available < b->size.available :
+                                                      (uintptr_t)a->heap < (uintptr_t)b->heap;
+  }
+  static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) { // alignment defaults to the VM page size
+    return [heap maxAvailableSizeWithAlignment:Alignment];
+  }
+  NSUInteger Size() { // size reported by the MTLHeap itself
+    return [heap size];
+  }
+  id<MTLBuffer> newMTLBuffer(size_t length, uint32_t usage) { // bookkeeping is only touched when the heap returns a buffer
+    id<MTLBuffer> buf = [heap newBufferWithLength:length options:getOptions(usage)];
+    if (buf) {
+      updateAvailableSize();
+      n_buffers++;
+    }
+    return buf;
+  }
+  // returns the retainCount before releasing the buffer
+  uint32_t releaseMTLBuffer(id<MTLBuffer>& buffer) {
+    const uint32_t retainCount = [buffer retainCount];
+    [buffer release];
+    buffer = nil; // the caller's handle is cleared through the reference parameter
+    updateAvailableSize();
+    n_buffers--;
+    return retainCount;
+  }
+  // returns the retainCount before releasing the heap
+  uint32_t releaseMTLHeap() {
+    const uint32_t retainCount = [heap retainCount];
+    TORCH_INTERNAL_ASSERT(!n_buffers); // assert if heap isn't empty
+    [heap setPurgeableState:MTLPurgeableStateEmpty];
+    [heap release];
+    heap = nil;
+    size.available = 0;
+    return retainCount;
+  }
+  uint32_t retainCount() const { return [heap retainCount]; }
+  void updateAvailableSize() { size.available = heapAvailableSize(heap); } // re-queries the heap with the default alignment
+};
+typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);
+// A pool of heaps and cached buffers that share one device and one set of usage flags.
+struct BufferPool {
+  enum class Kind {
+    PRIVATE_SMALL,
+    PRIVATE_LARGE,
+    SHARED_SMALL,
+    SHARED_LARGE,
+    SCALAR,
+  };
+
+  const id<MTLDevice> device;
+  // flags customizing the pool for a particular purpose (see UsageFlags enum)
+  const uint32_t usage;
+  // number of buffers in the pool, in total
+  uint32_t n_buffers = 0;
+  // size of all allocations made on this pool
+  size_t allocated_size = 0;
+  // memory available in the pool, in total
+  size_t available_size = 0;
+  // heaps, ordered by "available" (not total) memory size
+  std::set<HeapBlock*, HeapComparison> heaps;
+  // only the "available" buffers of the pool (i.e., buffers not in-use)
+  std::set<BufferBlock*, BufferComparison> available_buffers;
+  // buffers in "limbo": already freed on the PyTorch side, yet not returned
+  // to the pool because command buffers still use them (retainCount > 1).
+  // Such a buffer can neither be recycled nor be handed back to the pool
+  // as available.
+  // It goes back to the pool once the command buffer's completionHandler
+  // callbacks have been called.
+  std::unordered_set<BufferBlock*> buffers_pending_free;
+  // heaps whose size update is still pending
+  std::unordered_set<HeapBlock*> heaps_pending_update;
+
+  BufferPool(const id<MTLDevice> Device, uint32_t Usage)
+      : device(Device), usage(Usage),
+        heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) {}
+};
+// Declares the heap-based allocator for MTLBuffers; most method bodies are defined outside this header.
+class MPSHeapAllocatorImpl {
+public:
+  explicit MPSHeapAllocatorImpl() :
+    m_device(at::mps::MPSDevice::getInstance()->device()),
+    m_max_buffer_size([m_device maxBufferLength]), // relies on m_device being declared (hence initialized) earlier
+    m_stream(getDefaultMPSStream()),
+    m_event_pool(getMPSEventPool()) {
+    init_allocator();
+  }
+  ~MPSHeapAllocatorImpl() {
+    emptyCache(); // cached buffers and heaps are released on destruction
+  }
+  // interface exposed to at::Allocator
+  id<MTLBuffer> malloc(size_t size, uint32_t usage);
+  // frees a buffer and returns it into buffer pool
+  void free(void* ptr);
+  // releases all the cached buffers and their associated heaps
+  void emptyCache();
+  // free inactive buffers that are pending to be freed
+  void freeInactiveBuffers();
+  // returns true if buffer was allocated from the shared pool
+  bool isSharedBuffer(const void* ptr);
+  // get the requested unaligned size of an MTLBuffer
+  ssize_t getUnalignedBufferSize(const void* ptr);
+  // set the shape of a base tensor from a view tensor
+  void setBufferShape(const void* ptr, const IntArrayRef& shape);
+  // retrieve the shape of a base tensor from a view tensor
+  IntArrayRef getBufferShape(const void* ptr);
+  // get the unique ID of the buffer
+  id_t getBufferId(const void* ptr);
+  // allocate a buffer from a specialized pool to import CPU scalars into GPU
+  id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
+  // returns a CPU-mapping of the input buffer and its retainCount,
+  // only if it has Shared storage-mode and was allocated by MPSAllocator
+  std::pair<const void*, uint32_t> getSharedBufferPtr(const void* buffer);
+  // records events for a list of MTLBuffers (list is used to lock the mutex once)
+  // returns true if any event was recorded (i.e., the passed buffers exist and are shared-storage)
+  bool recordEvents(c10::ArrayRef<const void*> buffers);
+  // waits for the event to signal the completion of GPU execution
+  // on the passed shared buffers (list is used to lock the mutex once)
+  // returns true if actually waited on any event
+  bool waitForEvents(c10::ArrayRef<const void*> buffers);
+  // this indicates how far (in Megabytes) the current total allocations are from the
+  // low watermark limit which is used to detect if we're under memory pressure
+  // This returns zero if we've reached the low watermark limit
+  ssize_t getLowWatermarkValue();
+  // (see m_low_watermark_ratio for description)
+  void setLowWatermarkRatio(double ratio);
+  // (see m_high_watermark_ratio for description)
+  void setHighWatermarkRatio(double ratio);
+  // (see m_low_watermark_limit for description)
+  size_t getLowWatermarkLimit() const { return m_low_watermark_limit; }
+  // (see m_max_total_allowed_size for description)
+  size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; }
+  // (see m_total_allocated_memory for description)
+  size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; }
+  // (see m_current_allocated_memory for description)
+  size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; }
+  // total GPU memory allocated in the process by Metal driver; including
+  // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
+  size_t getDriverAllocatedMemory() const { return current_allocated_size(); }
+  // recommended Max memory for Metal
+  size_t getRecommendedMaxMemory() const { return max_device_size(); }
+  // (see enum DebugVerbosity for description)
+  uint32_t getDebugVerbosity() const { return m_debug_verbosity; }
+  // returns the device that we allocate from
+  inline id<MTLDevice> Device() const { return m_device; }
+
+  // TODO: make a common function to do size unit conversions in PyTorch.
+  inline std::string format_size(uint64_t size) const;
+
+private:
+  // (see m_high_watermark_ratio for description)
+  constexpr static double default_high_watermark_ratio = 1.7;
+  // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
+  constexpr static double default_high_watermark_upper_bound = 2.0;
+  // (see m_low_watermark_ratio for description)
+  // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
+  constexpr static double default_low_watermark_ratio_unified  = 1.4;
+  constexpr static double default_low_watermark_ratio_discrete = 1.0;
+
+  const id<MTLDevice> m_device;
+  std::recursive_mutex m_mutex;
+  // allocated buffers by device pointer
+  ska::flat_hash_map<const void*, BufferBlock*> m_allocated_buffers;
+  // using a container for pools to simplify iterating them
+  ska::flat_hash_map<BufferPool::Kind, std::unique_ptr<BufferPool>> m_pools;
+  // total memory allocated by HeapAllocator (including blocks in pools)
+  size_t m_total_allocated_memory = 0;
+  // currently active memory allocations in use (i.e., blocks not in pools)
+  size_t m_current_allocated_memory = 0;
+  // max buffer size allowed by Metal (set from the device's maxBufferLength in the constructor)
+  size_t m_max_buffer_size = 0;
+  // maximum total size allowed to be allocated
+  size_t m_max_total_allowed_size = 0;
+  // high watermark ratio is a hard limit for the total allowed allocations
+  // 0. : disables high watermark limit (may cause system failure if system-wide OOM occurs)
+  // 1. : recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize)
+  // >1.: allows limits beyond the device.recommendedMaxWorkingSetSize
+  // e.g., value 0.95 means we allocate up to 95% of recommended maximum
+  // allocation size; beyond that, the allocations would fail with OOM error.
+  double m_high_watermark_ratio;
+  // low watermark ratio is a soft limit to attempt limiting memory allocations up to the lower watermark
+  // level by garbage collection or committing command buffers more frequently (a.k.a, adaptive commit).
+  // Value between 0 and m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection)
+  // e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of recommended maximum
+  // allocation size.
+  double m_low_watermark_ratio;
+  // low watermark size limit (in Bytes) at the time we initialize the allocator
+  size_t m_low_watermark_limit;
+  // use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity
+  uint32_t m_debug_verbosity;
+  // default MPS stream
+  MPSStream* m_stream;
+  // we hold a reference to MPSEventPool so it could get destroyed after MPSAllocator
+  std::shared_ptr<MPSEventPool> m_event_pool;
+
+  void init_allocator();
+  void init_buffer_pools();
+  HeapBlock* get_free_heap(AllocParams& params);
+  bool get_free_buffer(AllocParams& params);
+  BufferBlock* get_allocated_buffer_block(const void* ptr);
+  BufferBlock* alloc_buffer_block(size_t size, uint32_t usage);
+  bool alloc_buffer(AllocParams& params);
+  void free_buffer(BufferBlock* buffer_block);
+  // returns true if the container heap is also released
+  bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true);
+  void release_buffers(BufferPool& pool);
+  bool release_available_cached_buffers(AllocParams& params);
+  bool release_cached_buffers();
+  // free unused cached blocks to reclaim GPU memory if memory pressure is high
+  void garbage_collect_cached_buffers(AllocParams& params);
+  // returns the suitable buffer pool type for the usage or
+  // requested/allocated sizes
+  BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage);
+  // returns the aligned allocation size that is optimized
+  // for the buffers to get reused frequently
+  size_t get_allocation_size(size_t size, uint32_t usage) const;
+  // maximum size of device memory available for allocation in current process
+  // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
+  size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; }
+  // there are implicit allocations from MPS backend, so we need to query the 'device' for
+  // total allocated size instead of manually tracking in MPSAllocator
+  size_t current_allocated_size() const { return [m_device currentAllocatedSize]; }
+
+  bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const { // always returns true
+    for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) { // Create(name) is called for every registered key on each invocation
+      MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event);
+    }
+    return true;
+  }
+};
+
+} // namespace at::mps::HeapAllocator
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h
new file mode 100644
index 0000000000000000000000000000000000000000..9aa4769f76ed68669313be64ed75177a743df5bd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h
@@ -0,0 +1,64 @@
+//  Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#include <c10/core/Allocator.h>
+#include <c10/util/Registry.h>
+#include <ATen/core/ATen_fwd.h>
+// MB(x): x mebibytes expressed in bytes; the argument is parenthesized so MB(a + b) scales the whole sum.
+#define MB(x) ((x) * 1048576UL)
+
+namespace at::mps {
+
+// this is the public interface to access MPSAllocator.
+// Do not declare methods that would depend on MPS or Metal frameworks.
+class IMPSAllocator : public c10::Allocator {
+public:
+  // see the comments in MPSAllocator.h for the description of these methods.
+  virtual void emptyCache() const = 0;
+  virtual void freeInactiveBuffers() const = 0;
+  virtual ssize_t getUnalignedBufferSize(const void* ptr) const = 0;
+  virtual IntArrayRef getBufferShape(const void* ptr) const = 0;
+  virtual id_t getBufferId(const void* ptr) const = 0;
+  virtual void setBufferShape(const void* ptr, const IntArrayRef& shape) const = 0;
+  virtual bool isSharedBuffer(const void* ptr) const = 0;
+  virtual bool isSharedStorageSupported() const = 0; // NOTE(review): no same-named method on MPSHeapAllocatorImpl; semantics come from the implementer
+  virtual c10::DataPtr allocScalarBufferWithValue(void* value, size_t size) const = 0; // returns a c10::DataPtr here, not a Metal buffer handle
+  virtual std::string formatSize(size_t size) const = 0; // presumably forwards to MPSHeapAllocatorImpl::format_size -- TODO confirm
+  virtual void setLowWatermarkRatio(double ratio) const = 0;
+  virtual void setHighWatermarkRatio(double ratio) const = 0;
+  virtual ssize_t getLowWatermarkValue() const = 0;
+  virtual size_t getLowWatermarkLimit() const = 0;
+  virtual size_t getHighWatermarkLimit() const = 0;
+  virtual size_t getTotalAllocatedMemory() const = 0;
+  virtual size_t getCurrentAllocatedMemory() const = 0;
+  virtual size_t getDriverAllocatedMemory() const = 0;
+  virtual size_t getRecommendedMaxMemory() const = 0;
+  virtual std::pair<const void*, uint32_t> getSharedBufferPtr(const void* ptr) const = 0;
+  virtual bool recordEvents(c10::ArrayRef<const void*> buffers) const = 0;
+  virtual bool waitForEvents(c10::ArrayRef<const void*> buffers) const = 0;
+};
+// Callback interface: implementers receive a buffer pointer together with the EventType being reported.
+class IMpsAllocatorCallback {
+ public:
+  enum class EventType {
+    ALLOCATED, // buffer got allocated to be used immediately
+    RECYCLED,  // buffer pulled from free list to be reused
+    FREED,     // buffer put to free list for future recycling
+    RELEASED,  // buffer memory released
+    ALLOCATION_FAILED // buffer allocation failed
+  };
+  virtual ~IMpsAllocatorCallback() = default;
+  virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0; // ptr may be nullptr (the allocator passes nullptr when it has no buffer block)
+};
+
+// MPS allocator executes every registered callback for each allocator event it triggers (see EventType above).
+C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback);
+#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) \
+  C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__);
+// returns an IMPSAllocator; presumably `sharedAllocator` selects the shared-storage allocator -- TODO confirm
+IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false);
+// NOTE(review): the name suggests a check for pinned MPS memory; confirm against the definition
+bool isMPSPinnedPtr(const void* data);
+
+} // namespace at::mps
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSDevice.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSDevice.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a7df3ba3620c66a7c2f490090df6e171224646c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSDevice.h
@@ -0,0 +1,84 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+#include <c10/core/Allocator.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+
+
+// Metal handle aliases: Objective-C protocol types when compiled as Objective-C(++),
+// opaque void* placeholders otherwise, so plain C++ translation units can include this header.
+#ifdef __OBJC__
+#include <Foundation/Foundation.h>
+#include <Metal/Metal.h>
+#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
+typedef id<MTLDevice> MTLDevice_t;
+typedef id<MTLLibrary> MTLLibrary_t;
+typedef id<MTLComputePipelineState> MTLComputePipelineState_t;
+#else
+typedef void* MTLDevice;
+typedef void* MTLDevice_t;
+typedef void* MTLLibrary_t;
+typedef void* MTLComputePipelineState_t;
+#endif
+
+namespace at::mps {
+
+// Helper enum used to ask whether an MPSGraph op is supported on a given macOS version
+enum class MacOSVersion : uint32_t {
+  MACOS_VER_13_1_PLUS = 0,
+  MACOS_VER_13_2_PLUS = 1,
+  MACOS_VER_13_3_PLUS = 2,
+  MACOS_VER_14_0_PLUS = 3,
+  MACOS_VER_14_4_PLUS = 4,
+  MACOS_VER_15_0_PLUS = 5,
+};
+
+//-----------------------------------------------------------------
+//  MPSDevice
+//
+// Singleton (private constructor, static getInstance()) wrapping the Metal device handle
+//-----------------------------------------------------------------
+
+class TORCH_API MPSDevice {
+ public:
+  /**
+   * MPSDevice should not be cloneable.
+   */
+  MPSDevice(MPSDevice& other) = delete;
+  /**
+   * MPSDevice should not be assignable.
+   */
+  void operator=(const MPSDevice&) = delete;
+  /**
+   * Gets single instance of the Device.
+   */
+  static MPSDevice* getInstance();
+  /**
+   * Returns the single device.
+   */
+  MTLDevice_t device() {
+    return _mtl_device;
+  }
+  /**
+   * NOTE(review): takes a MacOSVersion; presumably reports whether the running OS is at least that version -- confirm.
+   */
+  bool isMacOS13Plus(MacOSVersion version) const;
+
+  MTLComputePipelineState_t metalIndexingPSO(const std::string &kernel); // presumably the pipeline state for the named indexing kernel -- TODO confirm
+  MTLLibrary_t getMetalIndexingLibrary(); // presumably backed by _mtl_indexing_library -- TODO confirm
+
+  ~MPSDevice();
+
+ private:
+  static MPSDevice* _device; // static pointer; presumably what getInstance() hands out -- TODO confirm
+  MTLDevice_t _mtl_device; // handle returned by device()
+  MTLLibrary_t _mtl_indexing_library;
+  MPSDevice(); // private: instances are not constructible from outside the class
+};
+// backend-level free functions; only declared here (TORCH_API), their bodies live elsewhere
+TORCH_API bool is_available();
+TORCH_API bool is_macos_13_or_newer(MacOSVersion version);
+TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);
+
+} // namespace at::mps
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h
new file mode 100644
index 0000000000000000000000000000000000000000..880ff1c75d12e17ecf719f3de875c2217f852e51
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h
@@ -0,0 +1,100 @@
+//  Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#include <ATen/mps/MPSStream.h>
+#include <ctime>
+#include <stack>
+
+namespace at::mps {
+
+// NOTE: don't create instances of this class directly.
+// Use MPSEventPool to acquire instances of MPSEvent.
+class MPSEvent {
+public:
+  explicit MPSEvent(id_t ID, MPSStream* stream, bool enable_timing);
+  ~MPSEvent();
+
+  // records an event on the stream
+  void record(bool needsLock, bool syncEvent = false);
+  // makes all future work submitted to the stream wait for this event.
+  bool wait(bool needsLock, bool syncEvent = false);
+  // schedules a notifyListener callback for the event.
+  bool notify(bool needsLock, MTLSharedEventNotificationBlock block);
+  // checks if events are already signaled.
+  bool query() const;
+  // blocks the CPU thread until all the GPU work that was scheduled
+  // prior to recording this event is completed.
+  bool synchronize();
+  // resets this event with new parameters in case it gets reused from the event pool
+  void reset(MPSStream* stream, bool enable_timing);
+  // returns the unique ID of the event instance
+  id_t getID() const { return m_id; }
+  // returns the completion timestamp of the event
+  uint64_t getCompletionTime() const { return m_completion_time; }
+  // if already recorded, waits for cpu_sync_cv to be signaled
+  void waitForCpuSync();
+
+private:
+  id_t m_id; // value returned by getID()
+  // enables measuring the completion time of the notifyListener of this event
+  bool m_enable_timing;
+  uint64_t m_signalCounter = 0;
+  MPSStream* m_stream = nullptr;
+  MTLSharedEvent_t m_event = nullptr;
+  MTLSharedEventListener* m_listener = nullptr;
+  // used to sync the events created on this Stream with CPU
+  std::mutex m_cpu_sync_mutex{};
+  std::condition_variable m_cpu_sync_cv{};
+  // CondVar predicate to sync the events created on this Stream with CPU
+  bool m_cpu_sync_completed = false;
+  // used to compute elapsed time
+  uint64_t m_completion_time = 0;
+
+  void recordLocked(bool syncEvent); // presumably the bodies of record/wait/notify once locking is settled -- TODO confirm
+  bool waitLocked(bool syncEvent);
+  bool notifyLocked(MTLSharedEventNotificationBlock block);
+  void notifyCpuSync();
+  static uint64_t getTime() { // reads the CLOCK_MONOTONIC_RAW clock via clock_gettime_nsec_np
+    return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
+  }
+};
+// unique_ptr to an MPSEvent whose deleter is a type-erased std::function
+using MPSEventPtr = std::unique_ptr<MPSEvent, std::function<void(MPSEvent*)>>;
+// Hands out MPSEvent instances, either as MPSEventPtr handles or addressed by event ID.
+class MPSEventPool {
+public:
+  explicit MPSEventPool(MPSStream* default_stream);
+  ~MPSEventPool();
+
+  MPSEventPtr acquireEvent(bool enable_timing, MPSStream* stream);
+  void emptyCache();
+
+  // these are mainly used for MPSHooks and torch.mps.Event() bindings
+  id_t acquireEvent(bool enable_timing); // ID-returning overload; the other operations below take such an ID
+  void releaseEvent(id_t event_id);
+  void recordEvent(id_t event_id, bool syncEvent);
+  void waitForEvent(id_t event_id, bool syncEvent);
+  void synchronizeEvent(id_t event_id);
+  bool queryEvent(id_t event_id);
+  // returns elapsed time between two recorded events in milliseconds
+  double elapsedTime(id_t start_event_id, id_t end_event_id);
+
+private:
+  MPSStream* m_default_stream = nullptr;
+  std::recursive_mutex m_mutex;
+  std::stack<std::unique_ptr<MPSEvent>> m_pool{};
+  // dictionary to associate event IDs with event objects
+  // used to retain in-use events out of the pool
+  // for torch.mps.Event() bindings.
+  std::unordered_map<id_t, MPSEventPtr> m_in_use_events{};
+  uint64_t m_event_counter = 0;
+  std::function<void(MPSEvent*)> m_default_deleter; // same callable type as the deleter of MPSEventPtr
+
+  MPSEvent* getInUseEvent(id_t event_id, bool locked = true);
+};
+
+// a shared_ptr is returned so that MPSEventPool is destroyed only after the instances that depend on it
+std::shared_ptr<MPSEventPool> getMPSEventPool();
+
+} // namespace at::mps
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..98cb66207ab435c00d10366f24f967e3aa30eff8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h
@@ -0,0 +1,52 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include <ATen/core/Generator.h>
+#include <ATen/core/PhiloxRNGEngine.h>
+#include <c10/core/GeneratorImpl.h>
+#include <optional>
+
+namespace at {
+namespace mps::detail {
+// number of 32-bit words in rng_data_pod::state
+constexpr uint32_t PHILOX_STATE_N = 7;
+struct rng_data_pod {
+  std::array<uint32_t, PHILOX_STATE_N> state{1}; // brace-init: state[0] == 1, the remaining elements are zero
+  uint64_t seed = default_rng_seed_val;
+};
+// MPS generator entry points; only declared here (TORCH_API), defined outside this header
+TORCH_API const Generator& getDefaultMPSGenerator();
+TORCH_API Generator createMPSGenerator(uint64_t seed_val = default_rng_seed_val);
+
+} // namespace mps::detail
+// GeneratorImpl for the MPS backend: holds an rng_data_pod (state words + seed) and a Philox4_32 engine.
+struct TORCH_API MPSGeneratorImpl : public c10::GeneratorImpl {
+  // Constructors
+  MPSGeneratorImpl(uint64_t seed_in = default_rng_seed_val);
+  ~MPSGeneratorImpl() override = default;
+
+  // MPSGeneratorImpl methods
+  std::shared_ptr<MPSGeneratorImpl> clone() const;
+  void set_current_seed(uint64_t seed) override;
+  void set_offset(uint64_t offset) override;
+  uint64_t get_offset() const override;
+  uint64_t current_seed() const override;
+  uint64_t seed() override;
+  void set_state(const c10::TensorImpl& new_state) override;
+  c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
+  void update_philox_counters();
+
+  void set_engine(at::Philox4_32 engine) { engine_ = engine; } // replaces the stored engine with a copy of the argument
+  at::Philox4_32 engine() { return engine_; } // returns the engine by value (a copy)
+  uint32_t* state_data() { return data_.state.data(); } // pointer to the PHILOX_STATE_N state words
+  static DeviceType device_type() { return DeviceType::MPS; }
+
+private:
+  mps::detail::rng_data_pod data_;
+  at::Philox4_32 engine_;
+
+  MPSGeneratorImpl* clone_impl() const override;
+};
+
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGuardImpl.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGuardImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb50df2faeaeedc91087e6f90861835cbb5765f8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGuardImpl.h
@@ -0,0 +1,179 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+#include <c10/core/impl/DeviceGuardImplInterface.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <ATen/Context.h>
+#include <ATen/mps/MPSStream.h>
+#include <ATen/mps/MPSEvent.h>
+
+#ifdef __OBJC__
+#include <Foundation/Foundation.h>
+#include <Metal/Metal.h>
+#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
+#endif
+
+#include <ATen/Tensor.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorImpl.h>
+#include <sys/_types/_size_t.h>
+#include <memory>
+#include <c10/core/UndefinedTensorImpl.h>
+#include <c10/util/intrusive_ptr.h>
+
+
+namespace at::mps {
+
+typedef MPSEvent* mpsEvent_t; // raw MPSEvent pointer; MPSGuardImpl::createEvent() takes a pointer to one of these
+
+// TODO: Move the MPSGuardImpl to inherit from NoOpDeviceGuardImpl
+// https://github.com/pytorch/pytorch/issues/77170
+//
+// Device-guard implementation for the MPS backend. Every device query answers
+// with MPS device index 0 and every stream query with the DEFAULT stream on
+// that device; none of the inline methods below switches devices or streams.
+struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface {
+  static constexpr c10::DeviceType static_type = c10::DeviceType::MPS;
+
+  MPSGuardImpl() {}
+  // Accepts only DeviceType::MPS; any other value fails the internal assert.
+  explicit MPSGuardImpl(c10::DeviceType t) {
+    TORCH_INTERNAL_ASSERT(t == c10::DeviceType::MPS);
+  }
+
+  // Always DeviceType::MPS.
+  c10::DeviceType type() const override { return c10::DeviceType::MPS; }
+
+  // --- Device accessors: any argument is ignored and device 0 is reported. ---
+  Device exchangeDevice(Device) const override {
+    return onlyDevice();
+  }
+
+  Device getDevice() const override {
+    return onlyDevice();
+  }
+
+  std::optional<Device> uncheckedGetDevice() const noexcept {
+    return onlyDevice();
+  }
+
+  // Only checks that d is an MPS device; no state is changed.
+  void setDevice(Device d) const override {
+    TORCH_INTERNAL_ASSERT(d.is_mps());
+  }
+
+  void uncheckedSetDevice(Device) const noexcept override {
+    // TODO: Currently setting only device 0
+  }
+
+  // --- Stream accessors: each one yields the DEFAULT stream on device 0. ---
+  Stream getStream(Device) const noexcept override {
+    return onlyStream();
+  }
+
+  // The requested priority is ignored.
+  Stream getNewStream(Device, int /*priority*/ = 0) const override {
+    return onlyStream();
+  }
+
+  Stream getDefaultStream(Device) const override {
+    return onlyStream();
+  }
+
+  // NB: These do NOT set the current device
+  Stream exchangeStream(Stream) const noexcept override {
+    return onlyStream();
+  }
+
+  // One device when at::hasMPS() reports true, zero otherwise.
+  DeviceIndex deviceCount() const noexcept override {
+    //TODO: extend it for multi-device case
+    return static_cast<DeviceIndex>(at::hasMPS() ? 1 : 0);
+  }
+
+  // --- Event-related functions (declared here, defined out of line). ---
+  void createEvent(mpsEvent_t* event, const EventFlag flag) const;
+  void destroyEvent(void* event, const DeviceIndex device_index) const noexcept override;
+
+  void record(
+      void** event,
+      const Stream& stream,
+      const DeviceIndex device_index,
+      const EventFlag flag) const override;
+
+  void block(void* event, const Stream& stream) const override;
+
+  bool queryEvent(void* event) const override;
+
+ private:
+  // The single device / stream that all accessors above report.
+  static Device onlyDevice() { return Device(c10::DeviceType::MPS, 0); }
+  static Stream onlyStream() { return Stream(Stream::DEFAULT, onlyDevice()); }
+};
+
+/// A variant of OptionalDeviceGuard that is specialized for MPS; every member forwards to the wrapped guard_.
+struct OptionalMPSGuard {
+  explicit OptionalMPSGuard() : guard_() {} // wrapped guard is default-constructed
+  /// Forwards the optional device to the wrapped guard.
+  explicit OptionalMPSGuard(std::optional<Device> device_opt)
+      : guard_(device_opt) {}
+
+  /// Set the current MPS device to the passed device index, if it is not
+  /// nullopt
+  explicit OptionalMPSGuard(std::optional<DeviceIndex> device_index_opt)
+      : guard_(device_index_opt) {}
+
+  // Neither copying nor moving is allowed
+  OptionalMPSGuard(const OptionalMPSGuard&) = delete;
+  OptionalMPSGuard& operator=(const OptionalMPSGuard&) = delete;
+  OptionalMPSGuard(OptionalMPSGuard&& other) = delete;
+  OptionalMPSGuard& operator=(OptionalMPSGuard&& other) = delete;
+
+  /// Sets the MPS device to the given device, initializing the guard if it
+  /// is not already initialized.  Errors if the given device is not a MPS
+  /// device.
+  void set_device(Device device) {
+    guard_.set_device(device);
+  }
+
+  /// Sets the MPS device to the given device, initializing the guard if it is not already initialized.  Errors if the given device is not a MPS device.
+  /// NOTE(review): documented exactly like set_device; confirm how the two differ in InlineOptionalDeviceGuard.
+  void reset_device(Device device) {
+    guard_.reset_device(device);
+  }
+
+  /// Sets the MPS device to the given device index, initializing the guard if
+  /// it is not already initialized.
+  void set_index(DeviceIndex device_index) {
+    guard_.set_index(device_index);
+  }
+
+  /// Returns the device that was set immediately prior to initialization of the
+  /// guard, or nullopt if the guard is uninitialized.
+  std::optional<Device> original_device() const {
+    return guard_.original_device();
+  }
+
+  /// Returns the most recent device that was set using this device guard,
+  /// either from construction, or via set_device, if the guard is initialized,
+  /// or nullopt if the guard is uninitialized.
+  std::optional<Device> current_device() const {
+    return guard_.current_device();
+  }
+
+  /// Restore the original MPS device, resetting this guard to uninitialized
+  /// state.
+  void reset() {
+    guard_.reset();
+  }
+
+ private:
+  c10::impl::InlineOptionalDeviceGuard<MPSGuardImpl> guard_; // the guard that does all the work
+};
+
+
+C10_REGISTER_GUARD_IMPL(MPS, MPSGuardImpl); // NOTE(review): macro is defined in c10; presumably registers MPSGuardImpl for the MPS device type -- confirm
+
+} // namespace at::mps
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSHooks.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSHooks.h
new file mode 100644
index 0000000000000000000000000000000000000000..4858c0609f56b3e92ac0c6a5a012b20ff736e2fb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSHooks.h
@@ -0,0 +1,60 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include <ATen/detail/MPSHooksInterface.h>
+#include <ATen/Generator.h>
+#include <ATen/mps/MPSEvent.h>
+#include <optional>
+
+namespace at::mps {
+
+// The real implementation of MPSHooksInterface; only hasPrimaryContext() is defined inline, the rest is declared here.
+struct MPSHooks : public at::MPSHooksInterface {
+  MPSHooks(at::MPSHooksArgs) {} // the argument is not used
+  void initMPS() const override;
+
+  // MPSDevice interface
+  bool hasMPS() const override;
+  bool isOnMacOSorNewer(unsigned major, unsigned minor) const override;
+
+  // MPSGeneratorImpl interface
+  const Generator& getDefaultMPSGenerator() const override;
+
+  // MPSStream interface
+  void deviceSynchronize() const override;
+  void commitStream() const override;
+  void* getCommandBuffer() const override;
+  void* getDispatchQueue() const override;
+
+  // MPSAllocator interface
+  Allocator* getMPSDeviceAllocator() const override;
+  void emptyCache() const override;
+  size_t getCurrentAllocatedMemory() const override;
+  size_t getDriverAllocatedMemory() const override;
+  size_t getRecommendedMaxMemory() const override;
+  void setMemoryFraction(double ratio) const override;
+  bool isPinnedPtr(const void* data) const override;
+  Allocator* getPinnedMemoryAllocator() const override;
+
+  // MPSProfiler interface
+  void profilerStartTrace(const std::string& mode, bool waitUntilCompleted) const override;
+  void profilerStopTrace() const override;
+
+  // MPSEvent interface (events are referred to by their uint32_t id)
+  uint32_t acquireEvent(bool enable_timing) const override;
+  void releaseEvent(uint32_t event_id) const override;
+  void recordEvent(uint32_t event_id) const override;
+  void waitForEvent(uint32_t event_id) const override;
+  void synchronizeEvent(uint32_t event_id) const override;
+  bool queryEvent(uint32_t event_id) const override;
+  double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const override;
+
+  // Compatibility with Accelerator API
+  bool hasPrimaryContext(DeviceIndex device_index) const override {
+    // When MPS is available, it is always in use for the one device, so device_index is not consulted.
+    return true;
+  }
+};
+
+} // namespace at::mps
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ee9db5dd3242c72a93d2e4ebdf1a41670f98f48
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h
@@ -0,0 +1,402 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include <ATen/Tensor.h>
+#include <ATen/mps/MPSStream.h>
+#include <ATen/mps/MPSAllocatorInterface.h>
+
+#include <os/signpost.h>
+#include <os/log.h>
+
+#include <atomic>
+#include <ctime>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+namespace at::mps {
+
+namespace Profiler {
+
+struct BaseInfo {
+  // profiling info types
+  enum class Type {
+    GRAPH,
+    KERNEL,
+    COPY,
+    CPU_FALLBACK,
+  };
+
+  BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle)
+      : type(infoType), profileId(Id), handle(Handle) {}
+  virtual ~BaseInfo() = default;
+
+  // type of profiling info
+  Type type;
+  // unique profile ID for execution instances of operations or copies
+  uint64_t profileId;
+  // IDs generated by os_signpost: event-based and interval-based signposts
+  // can be in use at the same time, so each kind keeps its own ID.
+  os_signpost_id_t eventSignpostId = 0;
+  os_signpost_id_t intervalSignpostId = 0;
+  // accumulated GPU time in ms (obtained from CompletionHandler's "GPUEndTime - GPUStartTime")
+  std::atomic<double> totalGpuTime{0.0};
+  // accumulated Scheduling time in ms (obtained from CompletionHandler's "KernelEndTime - KernelStartTime")
+  std::atomic<double> totalSchedulingTime{0.0};
+  // indicates if the operation or copy execution has completed
+  std::atomic_bool completed{false};
+  // handle used to identify the profile info's instance (usually the pointer)
+  const uintptr_t handle;
+
+  virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const;
+
+  // Describes a tensor as Device:ScalarType[tensor.sizes()]; an undefined tensor yields
+  // "undefined". With includeBufferId set, MPS tensors also get "(buf#<id>:<retainCount>)".
+  static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) {
+    if (!tensor.defined()) {
+      return "undefined";
+    }
+    std::stringstream description;
+    const auto devType = tensor.device().type();
+    description << c10::DeviceTypeName(devType);
+    // see comments for INCLUDE_BUFFER_ID
+    if (includeBufferId && devType == at::kMPS) {
+      id<MTLBuffer> mtlBuffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
+      description << "(buf#" << (getIMPSAllocator()->getBufferId(mtlBuffer)) << ":"
+                  << mtlBuffer.retainCount << ")";
+    }
+    description << ":" << tensor.scalar_type() << tensor.sizes();
+    return description.str();
+  }
+
+  // Reading of CLOCK_MONOTONIC_RAW via clock_gettime_nsec_np().
+  static uint64_t getTime() { return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); }
+};
+
+struct OperationInfo : BaseInfo {
+  OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey)
+      : BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, reinterpret_cast<uintptr_t>(Handle)),
+        strKey(StrKey) {}
+
+  uint64_t runCount = 0;
+  std::string strKey;
+
+  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
+
+  // Yields kernelName followed by ":<tensor string>" for every tensor in the list.
+  static std::string buildKernelString(const std::string& kernelName,
+                                       const TensorList& tensors,
+                                       bool includeBufferId = false) {
+    std::string description = kernelName;
+    for (const Tensor& operand : tensors) {
+      description += ":" + BaseInfo::buildTensorString(operand, includeBufferId);
+    }
+    return description;
+  }
+};
+
+struct CpuFbInfo : BaseInfo {
+  CpuFbInfo(uint64_t Id, const std::string& OpName)
+      : BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) {}
+
+  uint64_t runCount = 0;
+  // the current and total overhead of copies in bytes required to convert the Op's
+  // input tensors from MPS to CPU and then output from CPU back to MPS
+  size_t currentCopyOverhead = 0;
+  size_t totalCopyOverhead = 0;
+  std::string opName;
+  std::string strKey;
+  uint64_t startTime = 0;
+
+  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
+
+  // Recomputes currentCopyOverhead as the summed nbytes() of the defined
+  // tensors in the list, then adds that amount to totalCopyOverhead.
+  void updateCopyOverhead(const TensorList& tensors) {
+    currentCopyOverhead = 0;
+    for (const Tensor& operand : tensors) {
+      currentCopyOverhead += operand.defined() ? operand.nbytes() : 0;
+    }
+    totalCopyOverhead += currentCopyOverhead;
+  }
+};
+
+struct CopyInfo : BaseInfo {
+  enum class Kind {
+    MPS_TO_MPS,
+    MPS_TO_CPU,
+    CPU_TO_MPS,
+  };
+
+  // kind starts out as Kind::MPS_TO_MPS; the constructor does not compute it.
+  CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter)
+      : BaseInfo(Type::COPY, Id, reinterpret_cast<uintptr_t>(Handle)),
+        length(Length), isNonBlocking(IsNonBlocking), usesBlitter(UsesBlitter) {}
+
+  Kind kind = Kind::MPS_TO_MPS;
+  size_t length;
+  bool isNonBlocking;
+  bool usesBlitter;
+  std::string srcStrKey;
+  std::string dstStrKey;
+  // for copies that don't use blitters, we measure CPU time
+  uint64_t startTime = 0;
+
+  const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override;
+  static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false);
+
+  // A tensor, when given, decides by its device type; otherwise the buffer is looked up in the MPS allocator.
+  static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) {
+    if (!tensor.has_value()) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer);
+      // getUnalignedBufferSize() returns -1 if input buffer is not on MPS device
+      return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0;
+    }
+    return tensor->device().type() == at::kMPS;
+  }
+
+  // Classifies a copy by which of its two endpoints isStorageOnMPS() reports as MPS storage.
+  static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer,
+                          const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) {
+    const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor);
+    const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS);
+    if (isSrcOnMPS == isDstOnMPS) {
+      return Kind::MPS_TO_MPS;
+    }
+    return isSrcOnMPS ? Kind::MPS_TO_CPU : Kind::CPU_TO_MPS;
+  }
+};
+
+struct CopyStat : CopyInfo {
+  explicit CopyStat(std::string CopyKindStr) : // base is built with a null handle, zero length/id and both flags false
+          CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {}
+  // total number of copies
+  size_t totalCount = 0;
+  // number of Scalar copies (i.e., less than sizeof(int64))
+  size_t scalarsCount = 0;
+  // number of blocking copies (i.e., require syncing to GPU)
+  size_t blockingCount = 0;
+  // number of copies that used memcpy(), instead of Metal Blit Encoder
+  size_t memcpyCount = 0;
+  // accumulated GPU time in ms for the scalar copies
+  std::atomic<double> scalarsGpuTime{0.0};
+  // copy kind in string type (taken by value in the constructor and moved in)
+  std::string kindStr;
+};
+
+class MPSProfiler { // profiling front end for MPS graphs/kernels, copies and CPU fallbacks (signposts + logging)
+public:
+  // lower 16 bits used for profiler options
+  enum ProfileOptions : uint32_t {
+    OPTIONS_NONE = 0,
+    // ALL_* means, all signpost types (RUN_OPERATION|BLIT_COPY|CPU_FALLBACK, etc.)
+    // (used for convenience to not compute bit flags by OR-ing manually)
+    // trace all signpost types using events
+    ALL_SIGNPOST_EVENTS    = (1 << 0),
+    // trace all signpost types using intervals
+    ALL_SIGNPOST_INTERVALS = (1 << 1),
+    // always wait for command buffer to finish executing after each commit
+    WAIT_UNTIL_COMPLETED   = (1 << 2),
+    // for interval-based signposts, include the scheduling portion of
+    // Graph/Kernel/Copy executions as well.
+    // if this flag is disabled, only "GPU run time" is included in the interval,
+    // and not schedule time.
+    INCLUDE_SCHEDULE_INTERVAL = (1 << 3),
+
+    // use these if you need to trace signposts types individually (rarely required)
+    // trace signpost using intervals
+    USE_INTERVALS = (1 << 4),
+    // trace signpost by emitting events
+    USE_EVENTS    = (1 << 5),
+    // used for sanity check (Change this when new option added)
+    OPTIONS_COUNT = (USE_EVENTS << 1) - 1,
+  };
+
+  // when adding new types, #define the type string in MPSProfiler.mm as well.
+  // upper 16 bits used for event types
+  enum SignpostTypes : uint32_t {
+    SIGNPOST_NONE = 0,
+    // trace signposts for PyTorch operation executions
+    RUN_OPERATION = (1 << 16),
+    // trace signposts for blitter copies
+    BLIT_COPY     = (1 << 17),
+    // trace signposts for ops that fall back on CPU
+    CPU_FALLBACK  = (1 << 18),
+    // used for sanity check (Change this when new type added)
+    SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1,
+  };
+
+  enum LogOptions : uint32_t {
+    LOG_NONE = 0,
+
+    // Info logging options during execution
+    // -------------------------------------
+    // prints operation info (id/key/run_count) during execution
+    OPERATION_INFO      = (1 << 0),
+    // prints copy info (src/dst tensors/buffers, size, etc.) during execution
+    COPY_INFO           = (1 << 1),
+    // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution
+    CPU_FALLBACK_INFO   = (1 << 2),
+
+    // Profiling Statistics logging options when process terminates
+    // ------------------------------------------------------------
+    // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before process terminates
+    // this is convenient to not combine following stats bit flags manually
+    ALL_STATS           = (1 << 3),
+    // prints operation stats (GPU times, run count, etc.) before process terminates
+    OPERATION_STATS     = (1 << 4),
+    // prints copies stats (GPU times, copy kinds, sizes, etc.) before process terminates
+    COPY_STATS          = (1 << 5),
+    // prints CPU Fallback stats (CPU times, run times, size of MPS<->CPU copies
+    // for tensors, etc.) before process terminates
+    CPU_FALLBACK_STATS  = (1 << 6),
+
+    // Metadata format options when logging the info
+    // ---------------------------------------------
+    // if enabled, includes GPU run time in metadata (i.e., GPUEndTime-GPUStartTime
+    // from Metal Command Buffers) (e.g., [GPU=0.324 ms])
+    INCLUDE_GPU_TIME    = (1 << 7),
+    // if enabled, includes GPU scheduling time in metadata separately
+    // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers)
+    // e.g., [GPU=0.324 ms, KRNL=0.036 ms]
+    INCLUDE_KERNEL_TIME = (1 << 8),
+    // if enabled, includes the unique buffer ID in metadata for the storage
+    // of a tensor that was allocated on MPSAllocator. This is useful (along with
+    // the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers that are involved
+    // with various operations.
+    INCLUDE_BUFFER_ID   = (1 << 9),
+
+    // used for sanity check (Change this when new option added)
+    LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1,
+  };
+
+  explicit MPSProfiler();
+  ~MPSProfiler();
+
+  // the handle is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
+  // the beginProfile*() functions return a profileId which is unique per graph/kernel/copy
+  uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph);
+  uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors);
+  uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer,
+                            const OptionalTensorRef srcTensor,
+                            const OptionalTensorRef dstTensor,
+                            size_t length, bool isNonBlocking, bool usesBlitter = true);
+  uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors);
+  void beginProfileGPUInterval(const void* handle);
+
+  void endProfileCopy(uint64_t profileId, SyncType syncType);
+  void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE);
+  void endProfileCPUFallback(const std::string& opName);
+
+  // these are used to hook into Python bindings for torch.mps.profiler module.
+  // this enables generating OS Signpost traces from MPSProfiler on-demand
+  // during runtime (instead of environment variables).
+  // The "mode" could be either "interval", "event", or both "interval,event"
+  // for interval-based and/or event-based signpost tracing.
+  void StartTrace(const std::string& mode, bool waitUntilCompleted);
+  void StopTrace();
+
+  // Abstractions for GPU trace capturing
+  bool isCaptureEnabled() const;
+  bool isCapturing() const;
+  void startCapture(const std::string& name, MPSStream* stream = nullptr);
+  void stopCapture(MPSStream* stream = nullptr);
+
+  // convenience functions to indicate whether signpost tracing or
+  // logging are enabled for the SignpostTypes. NOTE(review): LogOptions::ALL_STATS is not tested here; presumably it is expanded into the individual *_STATS bits elsewhere -- confirm
+  bool isOperationProfilingEnabled() const {
+    return (m_signpost_types & SignpostTypes::RUN_OPERATION) ||
+           (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS));
+  }
+  bool isCopyProfilingEnabled() const {
+    return (m_signpost_types & SignpostTypes::BLIT_COPY) ||
+           (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS));
+  }
+  bool isCPUFallbackProfilingEnabled() const {
+    return (m_signpost_types & SignpostTypes::CPU_FALLBACK) ||
+           (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS));
+  }
+  bool isSignpostTracingEnabled() const {
+    return (m_signpost_types != SignpostTypes::SIGNPOST_NONE);
+  }
+
+ private:
+  // indicates what type of signpost types are enabled and traced by MPS profiler.
+  uint32_t m_signpost_types = 0;
+  uint32_t m_profile_options = 0;
+  uint32_t m_log_options = 0;
+  uint64_t m_kernel_counter = 0;
+  uint64_t m_graph_counter = 0;
+  uint64_t m_cpu_fb_counter = 0;
+  uint64_t m_copy_counter = 0;
+  // technically, it's possible to trace both events and intervals at the same time
+  // so we use separate os_log categories for them
+  os_log_t m_os_log_events;
+  os_log_t m_os_log_intervals;
+  // stats logging could run either from destructor or signal handler
+  // so this is used to check if logging has already started.
+  std::atomic_bool hasLoggedStats{false};
+  // indicates there are pending completionHandler callbacks that haven't been called yet.
+  std::atomic_bool hasPendingCompletionHandlers{false};
+  // used to capture sigint signal to log profiling stats
+  static struct sigaction currentSigint, previousSigint;
+
+  // We use the following lists for two reasons:
+  // 1- for interval-based signposts the "begin" point won't be in same function
+  // as the "end" point where we need to be able to retrieve signpost's info
+  // 2- if Operations info needs to be logged when the process ends. NOTE(review): this comment originally named LogOptions::OPERATION_INFO, but the enum docs above attach end-of-process logging to OPERATION_STATS -- confirm which is meant.
+
+  // the pointer key for this map is either "MPSGraph*" or "id<MTLComputePipelineState>" for Metal Kernels
+  // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
+  std::unordered_map<uintptr_t, std::unique_ptr<OperationInfo>> m_op_info_list{};
+  // the string key for this map is the op name that we fall back to execute on CPU
+  // this list is retained and could be logged along with aggregate profiling numbers when the process ends.
+  std::unordered_map<std::string, std::unique_ptr<CpuFbInfo>> m_cpu_fb_info_list{};
+  // this list contains the info for copies, and its key is the unique profileId
+  // which is generated from m_copy_counter
+  // The copyInfo list is not retained.
+  std::unordered_map<uint64_t, std::unique_ptr<CopyInfo>> m_copy_info_list{};
+  // a short list that contains copy stats, keyed by copy kind
+  std::unordered_map<CopyInfo::Kind, std::unique_ptr<CopyStat>> m_copy_stat_list{};
+
+  mutable MTLCaptureManager *captureManager = nil;
+  unsigned captureCount = 0;
+
+  void initialize();
+  void beginProfileExecution(BaseInfo& info, bool cpuExecution = false);
+  void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id,
+                           os_signpost_id_t interval_signpost_id,
+                           double gpuTime, double schedulingTime);
+  void addProfilerScheduledHandler(BaseInfo& info);
+  void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType);
+  void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
+                         const std::string& msg) const;
+  void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id,
+                             const std::string& msg) const;
+  void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const;
+
+  void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime);
+  // returns true if logging the profiling info "during the execution" is enabled
+  bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded);
+  // logs all the profiling stats that are enabled
+  void logProfilingStats();
+  // logs kernel profiling stats when the process ends.
+  void logOperationsProfilingStats(std::FILE* f) const;
+  // logs CPU Fallback profiling stats when the process ends.
+  void logCPUFallbackProfilingStats(std::FILE* f) const;
+  // logs copy profiling stats when the process ends.
+  void logCopyProfilingStats(std::FILE* f) const;
+
+  os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr);
+  static SignpostTypes getSignpostType(BaseInfo::Type infoType);
+  static void handleIntSignal(int signal);
+};
+
+} // namespace Profiler
+
+Profiler::MPSProfiler& getMPSProfiler(); // returns a reference to an MPSProfiler; defined out of line
+
+} // namespace at::mps
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSStream.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSStream.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbaa055109042da726dd7d54a34d0d2ed4015458
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSStream.h
@@ -0,0 +1,133 @@
+//  Copyright © 2022 Apple Inc.
+
+#pragma once
+
+#include <cstdint>
+#include <utility>
+
+#include <c10/core/DeviceGuard.h>
+#include <c10/util/Exception.h>
+#include <c10/core/Stream.h>
+#include <ATen/mps/MPSDevice.h>
+
+#ifdef __OBJC__ // Objective-C++ translation units: alias the real Metal protocol types
+#include <Foundation/Foundation.h>
+#include <Metal/Metal.h>
+#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
+#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
+typedef id<MTLCommandQueue> MTLCommandQueue_t;
+typedef id<MTLCommandBuffer> MTLCommandBuffer_t;
+typedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;
+typedef id<MTLSharedEvent> MTLSharedEvent_t;
+typedef id<MTLDevice> MTLDevice_t;
+#else // plain C++ translation units: the same names become opaque void* stand-ins
+typedef void* MTLCommandQueue_t;
+typedef void* MTLCommandQueue;
+typedef void* MTLCommandBuffer_t;
+typedef void* MTLCommandBuffer;
+typedef void* MTLComputeCommandEncoder_t;
+typedef void* MTLSharedEvent_t;
+typedef void* dispatch_queue_t;
+typedef void* MTLDevice_t;
+#define nil nullptr // no trailing ';' in the macro, so nil also works inside expressions and argument lists
+#endif
+
+
+namespace at::mps {
+
+//-----------------------------------------------------------------
+//  MPSStream
+//-----------------------------------------------------------------
+
+enum class SyncType {
+  NONE = 0,                // no commit to command buffer
+  COMMIT = 1,              // commit and flush the command buffer
+  COMMIT_AND_WAIT = 2,     // flush and wait for command buffer execution to finish
+  COMMIT_AND_CONTINUE = 3, // commit and continue with a new underlying command buffer
+  COMMIT_ADAPTIVE = 4,     // commit adaptively based on available memory
+};
+
+class TORCH_API MPSStream
+{
+public:
+  enum Unchecked { UNCHECKED };
+
+  /// Construct a MPSStream from a Stream.  This construction is checked,
+  /// and will raise an error if the Stream is not, in fact, a MPS stream.
+  explicit MPSStream(Stream stream);
+
+  ~MPSStream();
+  MTLCommandQueue_t commandQueue() const { return _commandQueue; }
+  dispatch_queue_t queue() const { return _serialQueue; }
+  // Command-buffer, encoder, copy/fill and graph-execution API; all defined out of line.
+  MPSCommandBuffer* commandBuffer();
+  MTLComputeCommandEncoder_t commandEncoder();
+  void endKernelCoalescing();
+  void synchronize(SyncType syncType);
+  void fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t offset, SyncType syncType = SyncType::NONE);
+  void copy(id<MTLBuffer> srcBuffer, id<MTLBuffer> dstBuffer,
+            size_t length, size_t srcOffset, size_t dstOffset,
+            uint64_t profileId, SyncType syncType = SyncType::NONE);
+  void copy_and_sync(id<MTLBuffer> srcBuffer, id<MTLBuffer> dstBuffer,
+                     size_t length, size_t srcOffset, size_t dstOffset,
+                     bool non_blocking, uint64_t profileId);
+  void executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results, SyncType syncType = SyncType::NONE);
+  void addCompletedHandler(MTLCommandBufferHandler block);
+
+  /// Get the MPS device index that this stream is associated with.
+  c10::DeviceIndex device_index() const { return _stream.device_index(); }
+  /// Returns the same command queue as commandQueue().
+  MTLCommandQueue_t stream() const { return _commandQueue; }
+  /// Device obtained from the command queue (Objective-C message send).
+  MTLDevice_t device() const { return [_commandQueue device];}
+
+  /// Explicit conversion to Stream.
+  Stream unwrap() const { return _stream; }
+
+private:
+  Stream _stream;
+  MTLCommandQueue_t _commandQueue = nil;
+  MPSCommandBuffer* _commandBuffer = nil;
+  MPSCommandBuffer* _prevCommandBuffer = nil;
+  MTLComputeCommandEncoder_t _commandEncoder = nil;
+  MPSGraphExecutionDescriptor *_executionDescriptor = nil;
+  MPSGraphCompilationDescriptor *_compilationDescriptor = nil;
+  dispatch_queue_t _serialQueue = nullptr;
+  // CommitAndContinue is enabled by default
+  bool _enableCommitAndContinue = true;
+
+  // use synchronize() to access any of these commit functions outside MPSStream
+  void commit();
+  void commitAndWait();
+  void commitAndContinue();
+  void flush();
+};
+
+/**
+ * Get the current MPS stream (returned as a raw pointer; defined out of line)
+ */
+TORCH_API MPSStream* getCurrentMPSStream();
+
+/**
+ * Get the default MPS stream (returned as a raw pointer; defined out of line)
+ */
+TORCH_API MPSStream* getDefaultMPSStream();
+
+//-----------------------------------------------------------------
+//  MPSStreamImpl
+//-----------------------------------------------------------------
+
+class TORCH_API MPSStreamImpl
+{
+ public:
+  /**
+   * Gets single instance of the MPSStream.
+   */
+  static MPSStream* getInstance();
+
+ private:
+  static MPSStream* _stream; // presumably the instance handed out by getInstance() -- confirm in the implementation file
+  MPSStreamImpl(); // private, so the class cannot be constructed from outside
+};
+
+} // namespace at::mps
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CatKernel.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CatKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5afa1add4da3f8fb4bf0a41e874e9f1c8b5de000
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CatKernel.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/core/IListRef.h>
+
+namespace at::native {
+// Function-pointer type behind cat_serial_stub. NOTE(review): the arguments look like (result, inputs, dim) -- confirm against the kernel definition.
+using cat_serial_fn = void(*)(const Tensor &, const MaterializedITensorListRef&, int64_t);
+DECLARE_DISPATCH(cat_serial_fn, cat_serial_stub);
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Loops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Loops.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7a567aa915deaf2aae7d53e3f4cd76ad4f90d04
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Loops.h
@@ -0,0 +1,394 @@
+#pragma once
+
+// This file provides two functions to help write elementwise kernels:
+//
+//   cpu_kernel(TensorIterator iter, <lambda>)
+//   cpu_kernel_vec(TensorIterator iter, <lambda>, <vec_lambda>)
+//
+// Both functions may generate vectorized code. The cpu_kernel implementation
+// relies on the compiler's auto-vectorization. The cpu_kernel_vec
+// implementation uses x86 SIMD intrinsics when available. These functions
+// are only intended to be used in the ATen/native/cpu subdirectory, since files
+// in other directories are not compiled with AVX/AVX2 enabled. See README.md
+// for more details.
+//
+// For example, to write a multiplication kernel for float:
+//
+//   cpu_kernel(iter, [](float a, float b) { return a * b; });
+//
+// Or you may write:
+//
+//   cpu_kernel_vec(iter,
+//     [](float a, float b) { return a * b; },
+//     [](Vectorized<float> a, Vectorized<float> b) { return a * b; });
+//
+// See BinaryOpsKernel.cpp for the complete implementation
+//
+//
+
+#include <cstdint>
+#include <c10/util/C++17.h>
+#include <c10/util/Load.h>
+#include <c10/util/irange.h>
+#include <ATen/detail/FunctionTraits.h>
+#include <ATen/native/cpu/IsContiguous.h>
+#include <ATen/native/TensorIterator.h>
+#include <ATen/native/TensorIteratorDynamicCasting.h>
+#include <ATen/cpu/vec/vec.h>
+
+#include <utility>
+
+namespace at::native { inline namespace CPU_CAPABILITY {
+
+using namespace vec;
+
+// Builds the argument tuple for element `i`: for each input INDEX a value of
+// the functor's INDEX-th argument type is loaded with c10::load from
+// `data[INDEX] + i * strides[INDEX]`. The offset is added to a char pointer,
+// so strides are expressed in bytes.
+template <typename traits, std::size_t... INDEX>
+typename traits::ArgsTuple
+dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i,
+                 std::index_sequence<INDEX...>) {
+  return std::make_tuple(
+      c10::load<typename traits::template arg<INDEX>::type>(
+          data[INDEX] + i * strides[INDEX])...);
+}
+
+// Gathers element `i` of every input operand into traits::ArgsTuple by
+// forwarding to dereference_impl with the index pack 0 .. arity-1.
+template <typename traits>
+typename traits::ArgsTuple
+dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) {
+  return dereference_impl<traits>(
+      data, strides, i, std::make_index_sequence<traits::arity>{});
+}
+
+// Vectorized counterpart of dereference_impl. For each argument INDEX:
+//   - if S == INDEX + 1, the pre-broadcast `opt_scalar` vector is used;
+//   - otherwise a vector is loaded (unaligned) from
+//     `data[INDEX] + i * sizeof(scalar_t)`, i.e. the operand is addressed as
+//     densely packed scalar_t elements.
+// S == 0 never matches an INDEX, so every argument is then loaded from memory.
+template <typename traits, std::size_t... INDEX>
+typename traits::ArgsTuple
+dereference_vec_impl(char* C10_RESTRICT data[],
+                     const typename traits::result_type& opt_scalar,
+                     size_t S,
+                     int64_t i,
+                     std::index_sequence<INDEX...>) {
+  using Vec = typename traits::result_type;
+  using scalar_t = typename Vec::value_type;
+  return std::make_tuple(
+      S == INDEX + 1 ?
+      opt_scalar :
+      Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...);
+}
+
+// Gathers the vector arguments starting at element `i`, forwarding to
+// dereference_vec_impl with the index pack 0 .. arity-1.
+template <typename traits>
+typename traits::ArgsTuple
+dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) {
+  return dereference_vec_impl<traits>(
+      data, opt_scalar, S, i, std::make_index_sequence<traits::arity>{});
+}
+
+// Overload selected when `op` returns a value: data[0]/strides[0] describe the
+// output, the remaining entries describe the inputs. For every index in
+// [i, n) the inputs are loaded, `op` is applied and the result is stored.
+template <typename func_t,
+    std::enable_if_t<!std::is_void_v<typename function_traits<func_t>::result_type>>* = nullptr>
+inline void
+execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
+  using traits = function_traits<func_t>;
+  using result_type = typename traits::result_type;
+  for (int64_t idx = i; idx < n; ++idx) {
+    // inputs start at slot 1; slot 0 is the output
+    auto args = dereference<traits>(data + 1, strides + 1, idx);
+    auto* dst = reinterpret_cast<result_type*>(data[0] + idx * strides[0]);
+    *dst = c10::guts::apply(op, std::move(args));
+  }
+}
+
+// Overload selected when `op` returns void: every entry of `data` is passed
+// to `op` as an argument and this loop itself writes nothing back.
+template <typename func_t,
+    std::enable_if_t<std::is_void_v<typename function_traits<func_t>::result_type>>* = nullptr>
+inline void
+execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) {
+  using traits = function_traits<func_t>;
+  for (int64_t idx = i; idx < n; ++idx) {
+    auto args = dereference<traits>(data, strides, idx);
+    c10::guts::apply(op, std::move(args));
+  }
+}
+
+// Scalar fallback loop over elements [i, n) with one stride per tensor
+// (arity inputs plus one slot for the output). The compiler may
+// auto-vectorize it. Inputs and outputs may have different types.
+template <typename func_t>
+inline void
+basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
+  using traits = function_traits<func_t>;
+  constexpr int ntensors = traits::arity + 1;
+
+  // A local copy of the strides helps auto vectorization in older GCC
+  // versions.
+  int64_t local_strides[ntensors];
+  for (const auto t : c10::irange(ntensors)) {
+    local_strides[t] = strides_[t];
+  }
+
+  execute_op(data, local_strides, i, n, std::forward<func_t>(op));
+}
+
+// Recursive helper that stores the first N members of `tuple` into the first
+// N output tensors: members 0 .. N-2 are handled by the recursion, then
+// member N-1 is written to `data[N - 1] + i * strides[N - 1]`.
+template<class T, size_t N>
+struct TupleOutput {
+  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
+                     const T &tuple) {
+    TupleOutput<T, N - 1>::handle(data, strides, i, tuple);
+
+    constexpr size_t slot = N - 1;
+    auto value = std::get<slot>(tuple);
+    using value_t = decltype(value);
+    auto* dst = (value_t *)(data[slot] + i * strides[slot]);
+    *dst = value;
+  }
+};
+
+// Recursion terminator for TupleOutput: writes tuple member 0 to output 0.
+template<class T>
+struct TupleOutput<T, 1> {
+  static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i,
+                     const T &tuple) {
+    auto value = std::get<0>(tuple);
+    using value_t = decltype(value);
+    auto* dst = (value_t *)(data[0] + i * strides[0]);
+    *dst = value;
+  }
+};
+
+// Stores every member of `tuple` into the matching output tensor at element
+// `i` (member k goes to data[k] + i * strides[k]) via TupleOutput.
+template<class... Args>
+void handle_tuple_outputs(char* C10_RESTRICT data[],
+                          const int64_t* strides,
+                          int64_t i,
+                          const std::tuple<Args...> &tuple) {
+  using tuple_ref_t = decltype(tuple);
+  constexpr size_t member_count = sizeof...(Args);
+  TupleOutput<tuple_ref_t, member_count>::handle(data, strides, i, tuple);
+}
+
+// Loop used by `cpu_kernel_multiple_outputs`.
+// `data`/`strides_` hold the outputs first (one per member of the tuple
+// returned by `op`) followed by the inputs. For each element in [i, n):
+// 1. the inputs are gathered and `op` is invoked through `c10::guts::apply`;
+// 2. each member of the returned tuple is written to its output tensor by
+//    `handle_tuple_outputs`.
+template <typename func_t>
+inline void
+multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) {
+  using traits = function_traits<func_t>;
+  using result_type = typename traits::result_type;
+
+  constexpr int num_outputs = std::tuple_size<result_type>::value;
+  constexpr int ntensors = traits::arity + num_outputs;
+
+  // A local copy of the strides helps auto vectorization in older GCC
+  // versions.
+  int64_t local_strides[ntensors];
+  for (const auto t : c10::irange(ntensors)) {
+    local_strides[t] = strides_[t];
+  }
+
+  for (int64_t idx = i; idx < n; ++idx) {
+    auto inputs = dereference<traits>(
+        data + num_outputs, local_strides + num_outputs, idx);
+    auto results = c10::guts::apply(op, std::move(inputs));
+    handle_tuple_outputs(data, local_strides, idx, results);
+  }
+}
+
+// Explicitly vectorized loop implementation. All inputs and outputs must be
+// the same type and contiguous with one exception: a single input may be
+// a scalar (stride 0). Its position is indicated by the argument `S`. If `S`
+// is 0, then there are no scalar inputs.
+//
+// data_[0] is the output and data_[1..] are the inputs; `n` is the number of
+// elements. `op` is the scalar functor used for the tail, `vop` the
+// Vectorized functor used for the main loop.
+template <typename func_t, typename vec_func_t>
+inline void
+vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) {
+  using traits = function_traits<vec_func_t>;
+  using scalar_t = typename function_traits<func_t>::result_type;
+  using Vec = Vectorized<scalar_t>;
+  constexpr int ntensors = traits::arity + 1;
+
+  // Work on a local copy of the data pointers.
+  char* C10_RESTRICT data[ntensors];
+  for (const auto arg : c10::irange(ntensors)) {
+    data[arg] = data_[arg];
+  }
+
+  // Broadcast the scalar operand (if any) once; a zero vector is used as a
+  // placeholder when S == 0.
+  Vec opt_scalar = Vec(S > 0 ? *(scalar_t*)data[S] : scalar_t(0));
+  int64_t i = 0;
+  // Main loop: two vectors (2 * Vec::size() elements) per iteration.
+  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
+    auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
+    auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
+    auto out1 = c10::guts::apply(vop, std::move(args1));
+    auto out2 = c10::guts::apply(vop, std::move(args2));
+    out1.store(data[0] + i * sizeof(scalar_t));
+    out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
+  }
+  // Tail: the remaining elements go through the scalar loop, with stride
+  // sizeof(scalar_t) for every tensor except the scalar operand (stride 0).
+  if (i < n) {
+    int64_t strides[ntensors];
+    for (const auto arg : c10::irange(ntensors)) {
+      strides[arg] = (S > 0 && arg == S) ? 0 : sizeof(scalar_t);
+    }
+    basic_loop(data, strides, i, n, std::forward<func_t>(op));
+  }
+}
+
+
+// Terminating overload (empty index sequence): no input qualified as the
+// scalar operand, so the callback is invoked with 0.
+template <typename traits, typename cb_t>
+inline void unroll_contiguous_scalar_checks(
+    const int64_t* /*strides*/,
+    std::index_sequence<>,
+    cb_t&& cb) {
+  cb(0);
+}
+
+// Tests the candidate positions in order: the first INDEX0 for which
+// is_contiguous_scalar<traits, INDEX0 + 1>(strides) holds is reported to the
+// callback as INDEX0 + 1; otherwise the remaining indices are tried.
+template <typename traits, typename cb_t, size_t INDEX0, size_t ...INDEX>
+inline void unroll_contiguous_scalar_checks(
+    const int64_t* strides,
+    std::index_sequence<INDEX0, INDEX...>,
+    cb_t&& cb) {
+  if (!is_contiguous_scalar<traits, INDEX0 + 1>(strides)) {
+    unroll_contiguous_scalar_checks<traits>(strides, std::index_sequence<INDEX...>{}, std::forward<cb_t>(cb));
+    return;
+  }
+  cb(INDEX0 + 1);
+}
+
+// 2-d loop functor that pairs a scalar functor (`op`) with a Vectorized
+// functor (`vop`). For each of the `size1` outer iterations it processes
+// `size0` inner elements, choosing between vectorized_loop and basic_loop
+// based on the inner strides, then advances all data pointers by the outer
+// strides.
+template <typename op_t, typename vop_t>
+struct VectorizedLoop2d {
+  op_t op;
+  vop_t vop;
+
+  using traits = function_traits<op_t>;
+  // one output plus traits::arity inputs
+  static constexpr int ntensors = traits::arity + 1;
+  using data_t = std::array<char*, ntensors>;
+
+  VectorizedLoop2d(op_t op, vop_t vop):
+    op(std::move(op)), vop(std::move(vop)) {}
+
+  // Moves every data pointer forward by its outer stride.
+  static void advance(data_t &data, const int64_t *outer_strides) {
+    for (const auto arg : c10::irange(data.size())) {
+      data[arg] += outer_strides[arg];
+    }
+  }
+
+  // `strides` holds the ntensors inner strides followed by the outer strides.
+  void operator()(char** base, const int64_t *strides, int64_t size0, int64_t size1) {
+    data_t data;
+    std::copy_n(base, ntensors, data.data());
+    const int64_t *outer_strides = &strides[ntensors];
+
+    if (is_contiguous<traits>(strides)) {
+      // All operands contiguous: vectorized path without a scalar operand.
+      for (const auto i C10_UNUSED : c10::irange(size1)) {
+        vectorized_loop(data.data(), size0, 0, op, vop);
+        advance(data, outer_strides);
+      }
+    } else {
+      using Indices = std::make_index_sequence<traits::arity>;
+      // idx != 0: operand idx is the scalar input and the vectorized path is
+      // used; idx == 0: fall back to the generic strided scalar loop.
+      unroll_contiguous_scalar_checks<traits>(strides, Indices{}, [&](size_t idx) {
+        if (idx) {
+          for (const auto i C10_UNUSED : c10::irange(size1)) {
+            vectorized_loop(data.data(), size0, idx, op, vop);
+            advance(data, outer_strides);
+          }
+        } else {
+          for (const auto i C10_UNUSED : c10::irange(size1)) {
+            basic_loop(data.data(), strides, 0, size0, op);
+            advance(data, outer_strides);
+          }
+        }
+      });
+    }
+  }
+};
+
+// Factory that deduces the functor types and builds a VectorizedLoop2d from
+// the forwarded scalar and vector functors.
+template <typename op_t, typename vop_t>
+VectorizedLoop2d<op_t, vop_t> make_vectorized_loop2d(
+    op_t &&op, vop_t &&vop) {
+  using loop_t = VectorizedLoop2d<op_t, vop_t>;
+  return loop_t(std::forward<op_t>(op), std::forward<vop_t>(vop));
+}
+
+// Elementwise kernel driver: applies `op` to every element of `iter`
+// (exactly one output, traits::arity inputs) through basic_loop, then calls
+// iter.cast_outputs().
+template <typename func_t>
+void cpu_kernel(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
+  using traits = function_traits<func_t>;
+  // this could be extended to work with void return types
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+  // dynamic casting not currently supported on CPU
+  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
+
+  iter.for_each(
+      [&](char** ptrs, const int64_t* byte_strides, int64_t count) {
+        // iter.for_each only ever hands 1d slices to this lambda, and
+        // basic_loop copes with 1d slices of arbitrary strides
+        basic_loop(ptrs, byte_strides, 0, count, op);
+      },
+      grain_size);
+  iter.cast_outputs();
+}
+
+// Elementwise kernel driver for functors that produce several outputs.
+// Structured like cpu_kernel, but each slice is processed by
+// `multiple_outputs_loop`, which scatters the members of the returned tuple
+// to the output tensors.
+// No `needs_dynamic_casting` check is performed here for now, because the
+// lambda (`func_t`) returns a `std::tuple` rather than a `scalar_t`.
+// `gpu_kernel_multiple_outputs` likewise omits the check; in the future
+// `needs_dynamic_casting` could be extended to support both `std::tuple` and
+// `thrust::tuple`.
+template <typename func_t>
+void cpu_kernel_multiple_outputs(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) {
+  using traits = function_traits<func_t>;
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
+
+  iter.for_each(
+      [&](char** ptrs, const int64_t* byte_strides, int64_t count) {
+        multiple_outputs_loop(ptrs, byte_strides, 0, count, op);
+      },
+      grain_size);
+  iter.cast_outputs();
+}
+
+// Elementwise kernel driver with an explicit vectorized functor: `op` is the
+// scalar functor and `vop` its Vectorized counterpart. Both are wrapped in a
+// VectorizedLoop2d that is run by iter.for_each. The template flag
+// `check_dynamic_cast` (default true) controls whether the dynamic-casting
+// assertion is compiled in.
+template <bool check_dynamic_cast=true, typename func_t, typename vec_func_t>
+void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, int64_t grain_size = at::internal::GRAIN_SIZE) {
+  using traits = function_traits<func_t>;
+  // this could be extended to work with void return types
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+  // dynamic casting not currently supported on CPU, but some kernels (like Fill)
+  // explicitly dynamic_cast, so we give the opt-out of checking.
+  if constexpr (check_dynamic_cast) {
+    TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
+  }
+
+  iter.for_each(make_vectorized_loop2d(std::forward<func_t>(op), std::forward<vec_func_t>(vop)), grain_size);
+  iter.cast_outputs();
+}
+
+// Serial variant of cpu_kernel restricted to `range`: uses
+// iter.serial_for_each instead of iter.for_each. A void-returning `op` is
+// accepted when the iterator has no outputs; otherwise exactly one output is
+// required.
+template <typename func_t>
+void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) {
+  using traits = function_traits<func_t>;
+  constexpr bool result_void = std::is_void_v<typename traits::result_type>;
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity &&
+                        ((result_void && iter.noutputs() == 0) || (!result_void && iter.noutputs() == 1)));
+  // dynamic casting not currently supported on CPU
+  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
+
+  iter.serial_for_each(
+      [&](char** ptrs, const int64_t* byte_strides, int64_t count) {
+        basic_loop(ptrs, byte_strides, 0, count, op);
+      },
+      range);
+  iter.cast_outputs();
+}
+
+// Convenience overload: runs the serial kernel over the whole iterator,
+// i.e. the range {0, iter.numel()}.
+template <typename func_t>
+void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op) {
+  cpu_serial_kernel(iter, std::forward<func_t>(op), {0, iter.numel()});
+}
+
+// Serial variant of cpu_kernel_vec restricted to `range`: the scalar functor
+// `op` and vector functor `vop` are wrapped in a VectorizedLoop2d that is run
+// through iter.serial_for_each.
+template <typename func_t, typename vec_func_t>
+void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) {
+  using traits = function_traits<func_t>;
+  // this could be extended to work with void return types
+  TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
+  TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
+  // dynamic casting not currently supported on CPU
+  TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
+
+  iter.serial_for_each(make_vectorized_loop2d(std::forward<func_t>(op), std::forward<vec_func_t>(vop)), range);
+  iter.cast_outputs();
+}
+
+// Convenience overload: runs the serial vectorized kernel over the whole
+// iterator, i.e. the range {0, iter.numel()}.
+template <typename func_t, typename vec_func_t>
+void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) {
+  cpu_serial_kernel_vec(iter, std::forward<func_t>(op), std::forward<vec_func_t>(vop), {0, iter.numel()});
+}
+
+}} // namespace at::native::CPU_CAPABILITY
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c6424f8b0eac8a9b632666f89e8d5c8b5efd835
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h
@@ -0,0 +1,238 @@
+#pragma once
+
+#include <ATen/Parallel.h>
+#include <ATen/NumericUtils.h>
+#include <ATen/cpu/vec/vec.h>
+#include <ATen/cpu/vec/functional.h>
+#include <ATen/native/ReductionType.h>
+#include <c10/util/irange.h>
+#include <ATen/OpMathType.h>
+#include <ATen/native/cpu/utils.h>
+#include <ATen/OpMathType.h>
+
+namespace at::native {
+inline namespace CPU_CAPABILITY {
+
+using namespace vec;
+
+// Turns the runtime ReductionType `op` into a compile-time constant: the
+// callable passed as the remaining arguments is invoked inside a switch case
+// in which `reduce` is a static constexpr ReductionType equal to `op`, so the
+// callable can use `reduce` as a template argument.
+// The switch handles SUM, MEAN, MIN, MAX and PROD and has no default case.
+#define AT_DISPATCH_REDUCTION_TYPES(op, ...)                                   \
+  [&] {                                                                        \
+    switch (op) {                                                              \
+      case ReductionType::SUM: {                                               \
+        static constexpr auto reduce = ReductionType::SUM;                     \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case ReductionType::MEAN: {                                              \
+        static constexpr auto reduce = ReductionType::MEAN;                    \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case ReductionType::MIN: {                                               \
+        static constexpr auto reduce = ReductionType::MIN;                     \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case ReductionType::MAX: {                                               \
+        static constexpr auto reduce = ReductionType::MAX;                     \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+      case ReductionType::PROD: {                                              \
+        static constexpr auto reduce = ReductionType::PROD;                    \
+        return __VA_ARGS__();                                                  \
+      }                                                                        \
+    }                                                                          \
+  }()
+
+// Identity element of the reduction, expressed in the accumulation type:
+// 0 for SUM/MEAN, 1 for PROD, -infinity for MAX and +infinity for MIN.
+template <typename scalar_t, ReductionType reduce>
+inline vec_scalar_t<scalar_t> init_value() {
+  using acc_t = vec_scalar_t<scalar_t>;
+  using limits_t = std::numeric_limits<acc_t>;
+  if (reduce == ReductionType::SUM ||
+      reduce == ReductionType::MEAN) {
+    return static_cast<acc_t>(0);
+  }
+  if (reduce == ReductionType::PROD) {
+    return static_cast<acc_t>(1);
+  }
+  if (reduce == ReductionType::MAX) {
+    return -limits_t::infinity();
+  }
+  // only MIN is left
+  TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
+  return limits_t::infinity();
+}
+
+// Returns the caller-supplied `initial` converted to the accumulation type
+// when it is set, otherwise the identity element of the reduction.
+template <typename scalar_t, ReductionType reduce>
+inline vec_scalar_t<scalar_t> init_value(const std::optional<Scalar>& initial) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  if (!initial.has_value()) {
+    return init_value<scalar_t, reduce>();
+  }
+  return initial.value().to<acc_t>();
+}
+
+// Fills out[0 .. size) with `val` using the vectorized map helper; the
+// previous contents of `out` are ignored.
+template <typename scalar_t>
+inline void init(scalar_t* out, int64_t size, const vec_scalar_t<scalar_t>& val) {
+  using Vec = Vectorized<vec_scalar_t<scalar_t>>;
+  map<scalar_t>(
+      [val](Vec /*unused*/) { return Vec(val); },
+      out, out, size);
+}
+
+// Fills out[0 .. size) with `initial` when it is set, otherwise with the
+// identity element of the reduction.
+template <typename scalar_t, ReductionType reduce>
+inline void init(scalar_t* out, int64_t size, const std::optional<Scalar>& initial) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  const acc_t fill_value = init_value<scalar_t, reduce>(initial);
+  init(out, size, fill_value);
+}
+
+// `include_self` overload, used by scatter_reduce: when `include_self` is
+// true `out` is left untouched, otherwise it is filled with the identity
+// element of the reduction.
+template <typename scalar_t, ReductionType reduce>
+inline void init(scalar_t* out, int64_t size, bool include_self = false) {
+  using acc_t = vec_scalar_t<scalar_t>;
+  if (include_self) {
+    return;
+  }
+  const acc_t identity = init_value<scalar_t, reduce>();
+  init(out, size, identity);
+}
+
+// Prepares the opmath accumulation buffer: with `include_self` the values of
+// `self_ptr` are converted into `buffer_ptr`; without it the buffer is
+// initialised through init<opmath_type, reduce>.
+template <typename scalar_t, ReductionType reduce>
+inline void _init(scalar_t* self_ptr, at::opmath_type<scalar_t>* buffer_ptr, int64_t size, bool include_self) {
+  if (include_self) {
+    vec::convert(self_ptr, buffer_ptr, size);
+    return;
+  }
+  init<at::opmath_type<scalar_t>, reduce>(buffer_ptr, size, include_self);
+}
+
+// Scalar maximum (disabled for Vec2): returns `y` when `y` is NaN, otherwise
+// std::max(x, y).
+template <typename scalar_t>
+inline typename std::enable_if<!std::is_same<scalar_t, Vec2>::value, scalar_t>::type
+_max(const scalar_t& x, const scalar_t& y) {
+  if (at::_isnan(y)) {
+    return y;
+  }
+  return std::max(x, y);
+}
+
+// Vectorized overload: element-wise maximum of two Vectorized<scalar_t>.
+template <typename scalar_t>
+inline Vectorized<scalar_t> _max(const Vectorized<scalar_t>& x, const Vectorized<scalar_t>& y) {
+  // vec::maximum propagates NaN
+  return vec::maximum(x, y);
+}
+
+// Overload enabled only when vec_t is Vec2; forwards to the `maximum`
+// overload found for Vec2.
+template <typename vec_t>
+inline typename std::enable_if<std::is_same<vec_t, Vec2>::value, Vec2>::type
+_max(const vec_t& x, const vec_t& y) {
+  // vec::maximum propagates NaN
+  return maximum(x, y);
+}
+
+// Scalar minimum (disabled for Vec2): returns `y` when `y` is NaN, otherwise
+// std::min(x, y).
+template <typename scalar_t>
+inline typename std::enable_if<!std::is_same<scalar_t, Vec2>::value, scalar_t>::type
+_min(const scalar_t& x, const scalar_t& y) {
+  if (at::_isnan(y)) {
+    return y;
+  }
+  return std::min(x, y);
+}
+
+// Vectorized overload: element-wise minimum of two Vectorized<scalar_t>.
+template <typename scalar_t>
+inline Vectorized<scalar_t> _min(const Vectorized<scalar_t>& x, const Vectorized<scalar_t>& y) {
+  // vec::minimum propagates NaN
+  return vec::minimum(x, y);
+}
+
+// Overload enabled only when vec_t is Vec2; forwards to the `minimum`
+// overload found for Vec2.
+template <typename vec_t>
+inline typename std::enable_if<std::is_same<vec_t, Vec2>::value, Vec2>::type
+_min(const vec_t& x, const vec_t& y) {
+  // vec::minimum propagates NaN
+  return minimum(x, y);
+}
+
+// Applies `vec_fun(acc, x)` element-wise where `acc` comes from `input_data`
+// (accumulation type `accumut`) and `x` from `input_data2` (reduced floating
+// point type `scalar_t`), storing the result in `output_data`.
+// Each Vectorized<scalar_t> is widened by convert_to_float into two vectors,
+// which are paired with two consecutive `aVec` loads. The indexing assumes
+// Vec::size() == 2 * aVec::size() (NOTE(review): not asserted here).
+template <typename scalar_t, typename accumut, typename Op,
+          typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void map_acc(
+    const Op& vec_fun,
+    accumut* output_data,
+    const accumut* input_data,
+    const scalar_t* input_data2,
+    int64_t size) {
+  using Vec = vec::Vectorized<scalar_t>;
+  using aVec = vec::Vectorized<accumut>;
+  int64_t d = 0;
+  constexpr int64_t kVecSize = Vec::size();
+  constexpr int64_t kaVecSize = aVec::size();
+  // Main loop: whole Vec-sized chunks.
+  for (d = 0; d < size - (size % kVecSize); d += kVecSize) {
+    Vec data2_vec = Vec::loadu(input_data2 + d);
+    auto [data2_avec0, data2_avec1] = convert_to_float<scalar_t>(data2_vec);
+    aVec input_vec0 = aVec::loadu(input_data + d);
+    aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize);
+    vec_fun(input_vec0, data2_avec0).store(output_data + d);
+    vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize);
+  }
+  // Tail: fewer than kVecSize elements remain; use counted loads/stores.
+  if (size - d > 0) {
+    int64_t tail_size = size - d;
+    Vec data2_vec = Vec::loadu(input_data2 + d, tail_size);
+    auto [data2_avec0, data2_avec1] = convert_to_float<scalar_t>(data2_vec);
+    if (tail_size > kaVecSize) {
+      // First half is a full aVec, second half is partial.
+      aVec input_vec0 = aVec::loadu(input_data + d);
+      aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize, tail_size - kaVecSize);
+      vec_fun(input_vec0, data2_avec0).store(output_data + d);
+      vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize, tail_size - kaVecSize);
+    } else {
+      // The tail fits in the first aVec.
+      aVec input_vec0 = aVec::loadu(input_data + d, tail_size);
+      vec_fun(input_vec0, data2_avec0).store(output_data + d, tail_size);
+    }
+  }
+}
+
+// Combines two values according to the reduction: x + y for SUM/MEAN, x * y
+// for PROD, and _max / _min (which propagate NaN) for MAX / MIN.
+template <typename T, ReductionType reduce>
+inline T update(const T& x, const T& y) {
+  if (reduce == ReductionType::SUM ||
+      reduce == ReductionType::MEAN) {
+    return x + y;
+  }
+  if (reduce == ReductionType::PROD) {
+    return x * y;
+  }
+  if (reduce == ReductionType::MAX) {
+    return _max(x, y);
+  }
+  // only MIN is left
+  TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN);
+  return _min(x, y);
+}
+
+// In-place vectorized reduction step: out[k] = update(out[k], data[k]) for
+// k in [0, K).
+template <typename scalar_t, ReductionType reduce>
+inline void update(scalar_t* out, const scalar_t* data, int64_t K) {
+  using Vec = vec::Vectorized<vec_scalar_t<scalar_t>>;
+  map2<scalar_t>(
+      [](Vec acc, Vec value) { return update<Vec, reduce>(acc, value); },
+      out, out, data, K);
+}
+
+// Overload for reduced floating point inputs: `data` holds scalar_t values
+// while `out` accumulates in at::opmath_type<scalar_t>. The step
+// out[k] = update(out[k], data[k]) for k in [0, K) is carried out by map_acc.
+template <typename scalar_t, ReductionType reduce,
+          typename std::enable_if_t<is_reduced_floating_point_v<scalar_t>, int> = 0>
+inline void update(at::opmath_type<scalar_t>* out, const scalar_t* data, int64_t K) {
+  using opmath_t = at::opmath_type<scalar_t>;
+  using Vec = vec::Vectorized<opmath_t>;
+  map_acc<scalar_t, opmath_t>(
+      [](Vec x, Vec y) { return update<Vec, reduce>(x, y); },
+      out,
+      out,
+      data,
+      K);
+}
+
+// Finalises a reduced row of K values in place. Only MEAN needs work: when
+// `count` is positive each value is divided by `count`. All other reductions
+// (and MEAN with a non-positive count) leave `out` unchanged.
+template <typename scalar_t, ReductionType reduce>
+inline void write(scalar_t* out, int64_t count, int64_t K) {
+  using Vec = vec::Vectorized<vec_scalar_t<scalar_t>>;
+  if (reduce != ReductionType::MEAN || count <= 0) {
+    return;
+  }
+  vec::map<scalar_t>(
+      [count](Vec v) { return v / Vec(count); },
+      out, out, K);
+}
+
+} // namespace CPU_CAPABILITY
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..0abcdb87a9ca6f3b0029eaa787ac26947809fc69
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h
@@ -0,0 +1,130 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/quantized/AffineQuantizerBase.h>
+
+namespace at {
+namespace native {
+
+// Quantization entry points. Each takes a source tensor `rtensor`, a
+// destination tensor `qtensor` passed by non-const reference, and the
+// quantization parameters; each returns a Tensor reference (presumably
+// `qtensor` -- confirm against the implementation).
+
+// Per-tensor affine: a single `scale` / `zero_point` pair.
+Tensor& quantize_tensor_per_tensor_affine(
+    const Tensor& rtensor,
+    Tensor& qtensor,
+    double scale,
+    int64_t zero_point);
+// Per-channel affine: `scales` / `zero_points` tensors along `axis`.
+// Note that `zero_points` is taken by value here.
+Tensor& quantize_tensor_per_channel_affine(
+    const Tensor& rtensor,
+    Tensor& qtensor,
+    const Tensor& scales,
+    Tensor zero_points,
+    int64_t axis);
+
+// Per-channel variant with float qparams: `scales` / `zero_points` tensors
+// along `axis`.
+Tensor& quantize_tensor_per_channel_float_qparams(
+    const Tensor& rtensor,
+    Tensor& qtensor,
+    const Tensor& scales,
+    const Tensor& zero_points,
+    int64_t axis);
+
+// Dequantization entry points, mirroring the quantize_* functions with the
+// roles swapped: `qtensor` is the source and `rtensor`, passed by non-const
+// reference, is the destination. Each returns a Tensor reference (presumably
+// `rtensor` -- confirm against the implementation).
+
+// Per-tensor affine: a single `scale` / `zero_point` pair.
+Tensor& dequantize_tensor_per_tensor_affine(
+    const Tensor& qtensor,
+    Tensor& rtensor,
+    double scale,
+    int64_t zero_point);
+// Per-channel affine along `axis`; `zero_points` is taken by value.
+Tensor& dequantize_tensor_per_channel_affine(
+    const Tensor& qtensor,
+    Tensor& rtensor,
+    const Tensor& scales,
+    Tensor zero_points,
+    int64_t axis);
+// Per-channel variant with float qparams along `axis`.
+Tensor& dequantize_tensor_per_channel_float_qparams(
+    const Tensor& qtensor,
+    Tensor& rtensor,
+    const Tensor& scales,
+    const Tensor& zero_points,
+    int64_t axis);
+
+// Function-pointer types of the kernels registered with the dispatch stubs
+// declared further down in this header. All kernels return void and write
+// through the non-const Tensor reference argument.
+
+// Per-tensor affine quantization kernel.
+using quantize_tensor_per_tensor_affine_fn =
+    void (*)(const Tensor& rtensor, Tensor& qtensor, double scale, int64_t zero_point);
+
+// Per-channel affine quantization kernel.
+using quantize_tensor_per_channel_affine_fn = void (*)(
+    const Tensor& rtensor,
+    Tensor& qtensor,
+    const Tensor& scales,
+    const Tensor& zero_points,
+    int64_t axis);
+
+// Per-channel quantization kernel with float qparams.
+using quantize_tensor_per_channel_float_qparams_fn = void (*)(
+    const Tensor& rtensor,
+    Tensor& qtensor,
+    const Tensor& scales,
+    const Tensor& zero_points,
+    int64_t axis);
+
+// Per-tensor affine dequantization kernel.
+using dequantize_tensor_per_tensor_affine_fn =
+    void (*)(const Tensor& qtensor, Tensor& rtensor, double scale, int64_t zero_point);
+
+// Per-channel affine dequantization kernel.
+using dequantize_tensor_per_channel_affine_fn = void (*)(
+    const Tensor& qtensor,
+    Tensor& rtensor,
+    const Tensor& scales,
+    const Tensor& zero_points,
+    int64_t axis);
+
+// Per-channel dequantization kernel with float qparams.
+using dequantize_tensor_per_channel_float_qparams_fn = void (*)(
+    const Tensor& qtensor,
+    Tensor& rtensor,
+    const Tensor& scales,
+    const Tensor& zero_points,
+    int64_t axis);
+
+// Sub-byte per-tensor kernels; unlike the kernels above, scale and zero_point
+// are both passed as float.
+using quantize_tensor_per_tensor_affine_sub_byte_fn =
+    void (*)(const Tensor& rtensor, Tensor& qtensor, float scale, float zero_point);
+
+using dequantize_tensor_per_tensor_affine_sub_byte_fn =
+    void (*)(const Tensor& qtensor, Tensor& rtensor, float scale, float zero_point);
+
+// Dispatch stubs, one per kernel function-pointer type declared above.
+
+// Quantization stubs.
+DECLARE_DISPATCH(
+    quantize_tensor_per_tensor_affine_fn,
+    quantize_tensor_per_tensor_affine_stub);
+DECLARE_DISPATCH(
+    quantize_tensor_per_channel_affine_fn,
+    quantize_tensor_per_channel_affine_stub);
+DECLARE_DISPATCH(
+    quantize_tensor_per_channel_float_qparams_fn,
+    quantize_tensor_per_channel_float_qparams_stub);
+
+// Dequantization stubs.
+DECLARE_DISPATCH(
+    dequantize_tensor_per_tensor_affine_fn,
+    dequantize_tensor_per_tensor_affine_stub);
+DECLARE_DISPATCH(
+    dequantize_tensor_per_channel_affine_fn,
+    dequantize_tensor_per_channel_affine_stub);
+DECLARE_DISPATCH(
+    dequantize_tensor_per_channel_float_qparams_fn,
+    dequantize_tensor_per_channel_float_qparams_stub);
+
+// Sub-byte stubs.
+DECLARE_DISPATCH(
+    quantize_tensor_per_tensor_affine_sub_byte_fn,
+    quantize_tensor_per_tensor_affine_sub_byte_stub);
+
+DECLARE_DISPATCH(
+    dequantize_tensor_per_tensor_affine_sub_byte_fn,
+    dequantize_tensor_per_tensor_affine_sub_byte_stub);
+
+// Templated per-tensor helpers. T is not deducible from the arguments, so it
+// must be given explicitly (presumably the quantized element type -- confirm
+// against the definitions). Both tensors are taken by value and a Tensor is
+// returned by value.
+template <typename T>
+TORCH_API Tensor quantize_tensor(
+    Tensor rtensor,
+    Tensor qtensor,
+    double scale,
+    int64_t zero_point);
+template <typename T>
+TORCH_API Tensor dequantize_tensor(
+    Tensor qtensor,
+    Tensor rtensor,
+    double scale,
+    int64_t zero_point);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h
new file mode 100644
index 0000000000000000000000000000000000000000..31526c3ec3c52057463cd00f0dd8556160d4d2df
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h
@@ -0,0 +1,47 @@
+#pragma once
+#include <c10/macros/Export.h>
+#include <c10/core/ScalarType.h>
+
+namespace at {
+namespace native {
+
+// Quantize a float value into a uint value given scale and zero_point
+// T is the returned quantized type and must be specified explicitly.
+template <typename T>
+TORCH_API T quantize_val(double scale, int64_t zero_point, float value);
+// TODO combine this with quantize_val once the numerics for ARM are aligned
+// with it
+// Same purpose as quantize_val, but with float scale and int32_t zero_point.
+template <typename T>
+T quantize_val_arm(
+    const float scale,
+    const int32_t zero_point,
+    const float value);
+// Quantizes `count` floats from `src` into `dst` (count defaults to 8).
+// `precision` defaults to 8 (presumably the bit width of the quantized
+// values -- confirm against the definition).
+template <typename T, int precision = 8>
+void quantize_vec(
+    double scale,
+    int64_t zero_point,
+    const float* src,
+    T* dst,
+    size_t count = 8);
+// Converts one quantized value of type T back to float using scale and
+// zero_point.
+template <typename T>
+TORCH_API float dequantize_val(double scale, int64_t zero_point, T value);
+// Array form: reads `count` values from `src` and writes floats to `dst`
+// (count defaults to 8). NOTE(review): the declared return type is float,
+// unlike quantize_vec which returns void; the meaning of the returned value
+// is not visible here.
+template <typename T>
+TORCH_API float dequantize_vec(
+    double scale,
+    int64_t zero_point,
+    const T* src,
+    float* dst,
+    size_t count = 8);
+// Re-quantizes `src` from type SRC_T to DST_T. The parameters are unnamed:
+// two (double, int64_t) pairs, presumably (src_scale, src_zero_point) followed
+// by (dst_scale, dst_zero_point) -- confirm against the definition.
+template <typename SRC_T, typename DST_T>
+TORCH_API DST_T requantize_val(double, int64_t, double, int64_t, SRC_T src);
+
+// Given a multiplier and a zero_point, requantize int32_t computed values back
+// to quantized values. See comment above
+// make_per_tensor_affine_quantizer function for the usage of int64_t
+template <typename DST_T>
+TORCH_API DST_T
+requantize_from_int(double multiplier, int64_t zero_point, int64_t src);
+
+// Quantizes `value` with float scale and zero_point and returns an int;
+// `qmin` / `qmax` are presumably the bounds the result is clamped to
+// (confirm against the definition).
+int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f8ff918c1d2f3e421922650161aaa41eda9545f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h
@@ -0,0 +1,62 @@
+#pragma once
+#include <ATen/core/List.h>
+#include <ATen/native/ConvUtils.h>
+
+namespace at::native::quantized {
+namespace {
+// MakeConvOutputShape is used from both the CPU and CUDA libraries, and
+// exporting the symbol from torch_cpu would probably take more storage than
+// duplicating the implementation, which will likely be inlined away.
+template <int kSpatialDim>
+at::SmallVector<int64_t, kSpatialDim + 2> MakeConvOutputShape(
+    int N, // mini-batch
+    int M, // output channels
+    const std::array<int64_t, kSpatialDim>& input_image_shape,
+    const std::vector<int64_t>& kernel,
+    const torch::List<int64_t>& stride,
+    const torch::List<int64_t>& padding,
+    const torch::List<int64_t>& dilation); // primary template: declared only; specialized below for kSpatialDim 2 and 3
+
+#if defined(USE_CUDA) || defined(USE_PYTORCH_QNNPACK)
+template <>
+at::SmallVector<int64_t, 4> MakeConvOutputShape<2>(
+    int N, // mini-batch
+    int M, // output channels
+    const std::array<int64_t, 2>& input_image_shape, // {H, W}
+    const std::vector<int64_t>& kernel,
+    const at::List<int64_t>& stride,
+    const at::List<int64_t>& padding,
+    const at::List<int64_t>& dilation) {
+  // Keep extents as int64_t: storing them in int would silently truncate.
+  const int64_t H = input_image_shape[0], W = input_image_shape[1];
+  const int64_t Y_H =
+      (H + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1;
+  const int64_t Y_W =
+      (W + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1;
+  return {N, M, Y_H, Y_W}; // output shape is {N, M, Y_H, Y_W}
+}
+
+template <>
+at::SmallVector<int64_t, 5> MakeConvOutputShape<3>(
+    int N, // mini-batch
+    int M, // output channels
+    const std::array<int64_t, 3>& input_image_shape, // {D, H, W}
+    const std::vector<int64_t>& kernel,
+    const at::List<int64_t>& stride,
+    const at::List<int64_t>& padding,
+    const torch::List<int64_t>& dilation) {
+  // Keep extents as int64_t: storing them in int would silently truncate.
+  const int64_t D = input_image_shape[0];
+  const int64_t H = input_image_shape[1], W = input_image_shape[2];
+  const int64_t Y_D =
+      (D + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1;
+  const int64_t Y_H =
+      (H + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1;
+  const int64_t Y_W =
+      (W + 2 * padding[2] - dilation[2] * (kernel[2] - 1) - 1) / stride[2] + 1;
+  return {N, M, Y_D, Y_H, Y_W};
+}
+
+#endif
+} // anonymous namespace
+} // namespace at::native::quantized
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/Copy.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/Copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..d52c8ff0fb2c7f7f6eed17acceb660482144eef9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/Copy.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+// Declaration only. NOTE(review): presumably copies float `src` into quantized `self` -- confirm.
+Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src);
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fb7cfbb0e721f83ba5a9194ad72ea98c97d997d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/Dispatch.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+
+struct TensorIterator;
+
+namespace native {
+// Kernel signature dispatched through fake_quant_tensor_cachemask_stub.
+using fake_quant_tensor_cachemask_fn = void (*)(
+    Tensor& output,
+    Tensor& mask,
+    const Tensor& input,
+    float sc,
+    int64_t z_point,
+    int64_t quant_min,
+    int64_t quant_max);
+// Variant taking sc, z_point and fake_quant_enabled as Tensors.
+using fake_quant_tensor_cachemask_tensor_qparams_fn = void (*)(
+    Tensor& output,
+    Tensor& mask,
+    const Tensor& input,
+    const Tensor& sc,
+    const Tensor& z_point,
+    const Tensor& fake_quant_enabled,
+    int64_t quant_min,
+    int64_t quant_max);
+// Kernel signature dispatched through fake_quant_grad_learnable_tensor_stub.
+using fake_quant_learnable_grad_tensor_fn = void (*)(
+    TensorIterator& iter,
+    float scale,
+    float inv_scale,
+    int64_t zero_point,
+    int64_t quant_min,
+    int64_t quant_max,
+    float grad_factor);
+
+DECLARE_DISPATCH(fake_quant_tensor_cachemask_fn, fake_quant_tensor_cachemask_stub);
+DECLARE_DISPATCH(fake_quant_tensor_cachemask_tensor_qparams_fn, fake_quant_tensor_cachemask_tensor_qparams_stub);
+DECLARE_DISPATCH(fake_quant_learnable_grad_tensor_fn, fake_quant_grad_learnable_tensor_stub);
+// NOTE(review): no DECLARE_DISPATCH in this header uses the next alias -- confirm it is still needed.
+using fake_quant_per_channel_fn = void (*)(
+    TensorIterator &iter,
+    int64_t quant_min,
+    int64_t quant_max);
+// Kernel signature dispatched through fake_quant_per_channel_cachemask_stub.
+using fake_quant_per_channel_cachemask_fn = void (*)(
+    TensorIterator &iter,
+    TensorIterator &iter_mask,
+    int64_t quant_min,
+    int64_t quant_max);
+
+DECLARE_DISPATCH(fake_quant_per_channel_cachemask_fn, fake_quant_per_channel_cachemask_stub);
+// Kernel signature dispatched through fake_quant_grad_learnable_channel_stub.
+using fake_quant_learnable_per_channel_fn = void (*)(
+    TensorIterator &iter,
+    int64_t quant_min,
+    int64_t quant_max,
+    float grad_factor);
+
+DECLARE_DISPATCH(fake_quant_learnable_per_channel_fn, fake_quant_grad_learnable_channel_stub);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e240b5a8e9afc61f8828f4162f1b89c7ec06bb7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h
@@ -0,0 +1,14 @@
+#pragma once
+#include <ATen/native/TensorIterator.h>
+
+namespace at {
+namespace native {
+using masked_fill_kernel_quantized_fn = void(*)(TensorIterator& iter, const Scalar& value, double scale, int zero_point);
+using index_put_kernel_quantized_fn = void(*)(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate, double scale, int zero_point);
+// Dispatch stubs for the two kernel signatures above.
+DECLARE_DISPATCH(masked_fill_kernel_quantized_fn, masked_fill_kernel_quantized_stub);
+DECLARE_DISPATCH(index_put_kernel_quantized_fn, index_put_kernel_quantized_stub);
+
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h
new file mode 100644
index 0000000000000000000000000000000000000000..d73bc0adbc4ef953e0580585ab9261700374a45d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h
@@ -0,0 +1,147 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/core/ivalue.h>
+
+struct LinearPackedParamsBase : public torch::jit::CustomClassHolder {
+  virtual at::Tensor apply(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) = 0;
+  virtual at::Tensor apply_relu(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) = 0;
+
+  // Out variant of apply; this default implementation always throws.
+  virtual at::Tensor& apply_out(
+      const at::Tensor& /*input*/,
+      double /*output_scale*/,
+      int64_t /*output_zero_point*/,
+      at::Tensor& output) {
+    throw std::runtime_error(
+        "apply_out is not implemented for this packed "
+        "parameter type");
+    return output; // unreachable: the throw above always fires
+  }
+  // Out variant of apply_relu; this default implementation always throws.
+  virtual at::Tensor& apply_relu_out(
+      const at::Tensor& /*input*/,
+      double /*output_scale*/,
+      int64_t /*output_zero_point*/,
+      at::Tensor& output) {
+    throw std::runtime_error(
+        "apply_relu_out is not implemented for this packed "
+        "parameter type");
+    return output; // unreachable: the throw above always fires
+  }
+
+  // Corresponding pattern (the ops with `*` are part of the pattern that
+  // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_output_fp32):
+  // input -> q* -> dq* -> linear* ->
+  //         qweight -> dq* /
+  //
+  // After fusion:
+  // input -> quantized::linear_with_input_q_dq_qweight_dq_output_fp32* ->
+  //         qweight /
+  //
+  // Additional Note: the weight is packed as well
+  // Params:
+  //    X: float32 Tensor, will be quantized to quint8 in the op
+  //    W_prepack: packed qint8 quantized weight and bias
+  // Returns:
+  //    Y: float32 Tensor
+  virtual at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) {
+    throw std::runtime_error(
+        "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed "
+        "parameter type");
+    return {}; // unreachable: the throw above always fires
+  }
+
+  // Corresponding pattern (the ops with `*` are part of the pattern that
+  // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32):
+  // input -> q* -> dq* -> linear* -> relu* ->
+  //         qweight -> dq* /
+  //
+  // After fusion:
+  // input -> quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32* ->
+  //         qweight /
+  //
+  // Additional Note: the weight is packed as well
+  // Params:
+  //    input: float32 Tensor, will be quantized to quint8 in the op
+  // Returns:
+  //    float32 Tensor
+  virtual at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) {
+    throw std::runtime_error(
+        "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed "
+        "parameter type");
+    return {}; // unreachable: the throw above always fires
+  }
+  // Dynamic variants are pure virtual, so every concrete subclass provides them.
+  virtual at::Tensor apply_dynamic(
+      at::Tensor input,
+      bool reduce_range = false) = 0;
+  virtual at::Tensor apply_dynamic_relu(
+      at::Tensor input,
+      bool reduce_range = false) = 0;
+  // Out variants of the dynamic calls; these defaults always throw.
+  virtual at::Tensor& apply_dynamic_out(
+      const at::Tensor& /* input */,
+      at::Tensor& output,
+      bool /* reduce_range */) {
+    throw std::runtime_error(
+        "apply_dynamic_out is not implemented for this packed "
+        "parameter type");
+    return output; // unreachable: the throw above always fires
+  }
+  virtual at::Tensor& apply_dynamic_relu_out(
+      const at::Tensor& /* input */,
+      at::Tensor& output,
+      bool /* reduce_range */) {
+    throw std::runtime_error(
+        "apply_dynamic_relu_out is not implemented for this packed "
+        "parameter type");
+    return output; // unreachable: the throw above always fires
+  }
+
+  virtual std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() = 0;
+
+  virtual std::optional<at::Tensor> bias() = 0;
+  // Optional hook; this default implementation always throws.
+  virtual void set_bias(std::optional<at::Tensor> /*bias*/) {
+    throw std::runtime_error(
+        "set_bias is not implemented for this packed "
+        "parameter type");
+  }
+};
+
+template <int kSpatialDim = 2>
+struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { // abstract: every member is pure virtual
+  virtual at::Tensor apply(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) = 0;
+  virtual at::Tensor apply_relu(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) = 0;
+  virtual at::Tensor apply_dynamic(
+      const at::Tensor& input,
+      bool reduce_range) = 0;
+
+  virtual std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() = 0;
+  // Const accessors; list-valued ones return torch::List<int64_t> by value.
+  virtual torch::List<int64_t> stride() const = 0;
+  virtual torch::List<int64_t> padding() const = 0;
+  virtual torch::List<int64_t> output_padding() const = 0;
+  virtual torch::List<int64_t> dilation() const = 0;
+  virtual int64_t groups() const = 0;
+  virtual bool transpose() const = 0;
+};
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf86a13c139a1f429ecb2cc4918c04df9e4b3246
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h
@@ -0,0 +1,8 @@
+#include <ATen/core/Tensor.h>
+
+namespace at { // NOTE(review): unlike the sibling headers, this file has no #pragma once
+namespace native {
+TORCH_API Tensor
+quantized_add(Tensor qa, Tensor qb, double scale, int64_t zero_point); // declaration only
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6f47d611a19f4bcf804b63f20fb06be9a2c1f44
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/core/ivalue.h>
+
+struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder { // abstract: every member is pure virtual
+  virtual at::Tensor embeddingbag_byte(
+    const at::Tensor& indices,
+    const std::optional<at::Tensor>& offsets,
+    bool pruned_weights,
+    const std::optional<at::Tensor>& per_sample_weights_,
+    const std::optional<at::Tensor>& compressed_indices_mapping,
+    bool include_last_offset,
+    bool is_embedding_op) = 0;
+  // Same parameter list as embeddingbag_byte.
+  virtual at::Tensor embeddingbag_4bit(
+    const at::Tensor& indices,
+    const std::optional<at::Tensor>& offsets,
+    bool pruned_weights,
+    const std::optional<at::Tensor>& per_sample_weights_,
+    const std::optional<at::Tensor>& compressed_indices_mapping,
+    bool include_last_offset,
+    bool is_embedding_op) = 0;
+
+  virtual at::Tensor unpack() = 0;
+
+  virtual int64_t bit_rate() const = 0;
+  virtual int64_t version() const = 0;
+};
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..800308544a5ec595c0b08480a5907c8d7fac1ec0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h
@@ -0,0 +1,457 @@
+#pragma once
+
+#include <ATen/Config.h>
+#if AT_MKLDNN_ENABLED()
+#include <ATen/Tensor.h>
+#include <ATen/native/quantized/PackedParams.h>
+#include <ideep.hpp>
+#include <cpuinfo.h>
+
+#include <c10/util/CallOnce.h>
+
+using PrimitiveCacheKey = std::tuple<
+    double, // [0] input_scale
+    int64_t, // [1] input_zero_point
+    std::vector<int64_t>, // [2] input_shape
+    double, // [3] output_scale
+    int64_t, // [4] output_zero_point
+    int64_t, // [5] OMP_number_of_threads
+    double, // [6] accum_scale
+    int64_t>; // [7] accum_zero_point
+
+enum CacheKeyIndex { // std::get<> positions into PrimitiveCacheKey
+  InputScale, // 0
+  InputZeroPoint, // 1
+  InputShape, // 2
+  OutputScale, // 3
+  OutputZeroPoint, // 4
+  NumOfThreads, // 5; tuple slots 6 and 7 (accum_*) have no named index
+};
+
+// Common base of the primitive caches: holds the key an entry was built for.
+struct PrimitiveCache {
+  PrimitiveCacheKey key;
+  // True when `candidate` equals the stored key in every tuple field.
+  bool hit(const PrimitiveCacheKey& candidate) {
+    return candidate == key;
+  }
+};
+
+using LinearParams = ideep::matmul_forward_params;
+using Conv = dnnl::convolution_forward;
+using ConvDesc = dnnl::convolution_forward::primitive_desc;
+using ConvParams = ideep::convolution_forward_params;
+using Deconv = dnnl::deconvolution_forward;
+using DeconvDesc = dnnl::deconvolution_forward::primitive_desc;
+using DeconvParams = ideep::deconv_forward_params;
+
+struct LinearPrimitiveCache : PrimitiveCache { // cache entry holding ideep matmul forward params
+  LinearPrimitiveCache() {}
+
+  LinearPrimitiveCache(
+      const PrimitiveCacheKey& key,
+      const LinearParams& param) {
+    this->key = key;
+    this->param = param;
+  }
+
+  LinearParams param;
+
+  // For dynamic qlinear, scale and zero point are set at execution time, so
+  // only the rest of the key (input shape, thread count) is compared. The
+  // shapes are bound by reference to avoid copying two vectors per probe.
+  bool hit_dynamic(const PrimitiveCacheKey& new_key) {
+    const auto& cached_input_shape = std::get<InputShape>(this->key);
+    const auto& new_input_shape = std::get<InputShape>(new_key);
+    return (
+        cached_input_shape == new_input_shape &&
+        std::get<NumOfThreads>(this->key) == std::get<NumOfThreads>(new_key));
+  }
+
+  LinearParams& get_param() {
+    return param;
+  }
+};
+
+struct ConvPrimitiveCache : PrimitiveCache {
+  ConvPrimitiveCache() {}
+  // Stores the key in the base class and a copy of the params in this entry.
+  ConvPrimitiveCache(
+      const PrimitiveCacheKey& cache_key,
+      const ConvParams& conv_params) {
+    PrimitiveCache::key = cache_key;
+    params = conv_params;
+  }
+
+  ConvParams params;
+  // Mutable access to the cached params.
+  ConvParams& get_params() {
+    return this->params;
+  }
+};
+
+struct DeconvPrimitiveCache : PrimitiveCache {
+  DeconvPrimitiveCache() {}
+  // Stores the key in the base class and a copy of the params in this entry.
+  DeconvPrimitiveCache(
+      const PrimitiveCacheKey& cache_key,
+      const DeconvParams& deconv_params) {
+    PrimitiveCache::key = cache_key;
+    params = deconv_params;
+  }
+
+  DeconvParams params;
+  // Mutable access to the cached params.
+  DeconvParams& get_params() {
+    return this->params;
+  }
+};
+
+enum PostOps { // template argument type of PackedLinearWeightsOnednn::apply_impl
+  NoPostOp,
+  Relu,
+  LeakyRelu,
+  Tanh,
+  Gelu
+};
+
+
+struct PackedLinearWeightsOnednn : public LinearPackedParamsBase {
+  PackedLinearWeightsOnednn(
+      std::unique_ptr<ideep::tensor> weight,
+      std::optional<ideep::tensor> bias,
+      at::Tensor orig_weight,
+      std::optional<at::Tensor> orig_bias)
+      : weight_(std::move(weight)),
+        bias_(std::move(bias)),
+        orig_weight_(std::move(orig_weight)),
+        orig_bias_(std::move(orig_bias)) {
+    cache_initialized_flag = std::make_unique<c10::once_flag>(); // fresh once_flag per instance
+  }
+  std::unique_ptr<ideep::tensor> weight_;
+  std::optional<ideep::tensor> bias_;
+  at::Tensor orig_weight_;
+  std::optional<at::Tensor> orig_bias_;
+
+  at::Tensor apply(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+  at::Tensor apply_relu(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override;
+  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override;
+  // apply_leaky_relu and apply_tanh are extras: they override nothing in LinearPackedParamsBase.
+  at::Tensor apply_leaky_relu(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point,
+      double negative_slope);
+
+  at::Tensor apply_tanh(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point);
+
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+  // Returns orig_bias_, the at::Tensor bias captured by the constructor.
+  std::optional<at::Tensor> bias() override {
+    return orig_bias_;
+  }
+
+  static c10::intrusive_ptr<LinearPackedParamsBase> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias);
+
+ private:
+  LinearPrimitiveCache prim_cache;
+  std::unique_ptr<c10::once_flag> cache_initialized_flag;
+  // post_op is a compile-time selector; post_op_args defaults to an empty list.
+  template <PostOps post_op>
+  at::Tensor apply_impl(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point,
+      torch::List<at::Scalar> post_op_args = torch::List<at::Scalar>());
+
+  template <bool ReluFused>
+  at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range=false);
+
+  LinearPrimitiveCache& get_cache() {
+    return prim_cache;
+  }
+};
+
+template <int kSpatialDim = 2>
+struct PackedConvWeightsOnednn : public ConvPackedParamsBase<kSpatialDim> {
+  PackedConvWeightsOnednn(
+      std::unique_ptr<ideep::tensor> weight,
+      std::optional<ideep::tensor> bias,
+      at::Tensor orig_weight,
+      std::optional<at::Tensor> orig_bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      uint8_t transpose) // NOTE(review): uint8_t here, while prepack() and transpose() use bool
+      : weight_(std::move(weight)),
+        bias_(std::move(bias)),
+        orig_weight_(std::move(orig_weight)),
+        orig_bias_(std::move(orig_bias)),
+        stride_(std::move(stride)),
+        padding_(std::move(padding)),
+        output_padding_(std::move(output_padding)),
+        dilation_(std::move(dilation)),
+        groups_(groups),
+        transpose_(transpose) {
+    cache_initialized_flag = std::make_unique<c10::once_flag>(); // fresh once_flag per instance
+  }
+
+  std::unique_ptr<ideep::tensor> weight_;
+  std::optional<ideep::tensor> bias_;
+  at::Tensor orig_weight_;
+  std::optional<at::Tensor> orig_bias_;
+  torch::List<int64_t> stride_;
+  torch::List<int64_t> padding_;
+  torch::List<int64_t> output_padding_;
+  torch::List<int64_t> dilation_;
+  int64_t groups_;
+  uint8_t transpose_;
+
+  at::Tensor apply(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_dynamic(
+      const at::Tensor& input,
+      bool reduce_range) override;
+  // apply_add and apply_add_relu are extras: they override nothing in ConvPackedParamsBase.
+  at::Tensor apply_add(
+      const at::Tensor& input,
+      const at::Tensor& accum,
+      double output_scale,
+      int64_t output_zero_point);
+
+  at::Tensor apply_add_relu(
+      const at::Tensor& input,
+      const at::Tensor& accum,
+      double output_scale,
+      int64_t output_zero_point);
+
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+
+  static c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      bool transpose);
+  // The accessors below return the values captured by the constructor.
+  torch::List<int64_t> stride() const override {
+    return stride_;
+  }
+
+  torch::List<int64_t> padding() const override {
+    return padding_;
+  }
+
+  torch::List<int64_t> output_padding() const override {
+    return output_padding_;
+  }
+
+  torch::List<int64_t> dilation() const override {
+    return dilation_;
+  }
+
+  int64_t groups() const override {
+    return groups_;
+  }
+
+  bool transpose() const override {
+    return (bool)transpose_; // stored as uint8_t, exposed as bool
+  }
+
+ private:
+  ConvPrimitiveCache conv_prim_cache;
+  DeconvPrimitiveCache deconv_prim_cache;
+  std::unique_ptr<c10::once_flag> cache_initialized_flag;
+
+  template <bool ReluFused>
+  at::Tensor apply_impl(
+      const at::Tensor& input,
+      const std::optional<at::Tensor>& accum,
+      double output_scale,
+      int64_t output_zero_point);
+  // Each accessor asserts that transpose() matches the cache it hands out.
+  ConvPrimitiveCache& get_conv_cache() {
+    assert(!transpose());
+    return conv_prim_cache;
+  }
+
+  DeconvPrimitiveCache& get_deconv_cache() {
+    assert(transpose());
+    return deconv_prim_cache;
+  }
+};
+
+namespace onednn_utils {
+
+inline ideep::attr_t create_attr_by_post_op(
+    const c10::string_view& binary_post_op,
+    double binary_alpha,
+    double input1_scale,
+    int64_t input1_zero_point,
+    const ideep::tensor::desc& input1_desc,
+    const c10::string_view& unary_post_op,
+    const torch::List<std::optional<at::Scalar>>& unary_post_op_args,
+    const c10::string_view& unary_post_op_algorithm) {
+  // Maps the post-op names to an ideep attr; unsupported names fail a TORCH_CHECK.
+  if (binary_post_op == "none") {
+    if (unary_post_op == "relu") {
+      return ideep::attr_t::fuse_relu();
+    } else if (unary_post_op == "leaky_relu") {
+      TORCH_CHECK( // has_value() is checked too, so .value() below cannot throw
+          unary_post_op_args.size() == 1 && unary_post_op_args[0].has_value(),
+          "onednn qlinear: expect one argument for post op leaky_relu but got ", unary_post_op_args.size(), " args");
+      auto alpha = unary_post_op_args[0].value().to<float>();
+      return ideep::attr_t::fuse_relu_v2(alpha);
+    } else if (unary_post_op == "tanh") {
+      return ideep::attr_t::fuse_tanh();
+    } else if (unary_post_op == "gelu") {
+      TORCH_CHECK(
+          unary_post_op_algorithm == "none" || unary_post_op_algorithm == "tanh",
+          "onednn qlinear: algorithm for post op gelu must be none or tanh but got ", unary_post_op_algorithm);
+      auto post_algorithm = unary_post_op_algorithm == "none" ?
+        dnnl::algorithm::eltwise_gelu_erf :
+        dnnl::algorithm::eltwise_gelu_tanh;
+      return ideep::attr_t::fuse_gelu_v2(0.f, 0.f, post_algorithm);
+    } else if (unary_post_op == "hardtanh") {
+      TORCH_CHECK(
+          unary_post_op_args.size() == 2 &&
+              unary_post_op_args[0].has_value() &&
+              unary_post_op_args[1].has_value(),
+          "hardtanh is expected to have two scalar input: min_val and max_val");
+      auto lower_bound_value =
+          unary_post_op_args[0].value().to<float>();
+      auto upper_bound_value =
+          unary_post_op_args[1].value().to<float>();
+      return ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value);
+    } else if (unary_post_op == "hardswish") {
+      return ideep::attr_t::fuse_hardswish();
+    } else if (unary_post_op == "swish") {
+      return ideep::attr_t::fuse_swish();
+    } else {
+      TORCH_CHECK(
+          unary_post_op == "none",
+          "onednn qlinear: unsupported unary post op ", unary_post_op);
+    }
+  } else if (binary_post_op == "sum") {
+    if (unary_post_op == "none") {
+      return ideep::attr_t::fuse_sum(input1_scale, input1_zero_point);
+    } else if (unary_post_op == "relu") {
+      return ideep::attr_t::residual_with_sum_zero_point(input1_scale, input1_zero_point);
+    } else {
+      TORCH_CHECK(
+          false,
+          "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op sum");
+    }
+  } else if (binary_post_op == "add") {
+    if (unary_post_op == "none") {
+      return ideep::attr_t::fuse_binary(ideep::algorithm::binary_add, input1_desc);
+    } else if (unary_post_op == "relu") {
+      ideep::post_ops po;
+      po.append_binary(ideep::algorithm::binary_add, input1_desc);
+      po.append_eltwise(ideep::algorithm::eltwise_relu, 0, 0);
+      return ideep::attr_t::attr_post_ops(po);
+    } else {
+      TORCH_CHECK(
+          false,
+          "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op add");
+    }
+  } else {
+    TORCH_CHECK(
+        false,
+        "onednn qlinear: unsupported binary post op ", binary_post_op);
+  }
+  return ideep::attr_t(); // reached only when both post ops are "none"
+}
+
+// ONEDNN requires symmetric quantization of weight
+// Use this util function to check.
+inline bool is_weight_symmetric_quant(
+      const at::Tensor& weight,
+      bool is_transposed_conv) {
+  // Returns true only when every zero point of `weight` is 0. Unsupported
+  // schemes report false instead of raising, so callers can simply fall back.
+  const auto qtype = weight.qscheme();
+  if (qtype == c10::kPerTensorAffine) {
+    return weight.q_zero_point() == 0;
+  }
+  if (qtype != c10::kPerChannelAffine || is_transposed_conv) {
+    // Neither case is currently supported in PyTorch, but we do not want to
+    // raise an error in this util function.
+    return false;
+  }
+  // Fetch the zero-point tensor once instead of once per channel.
+  const at::Tensor zero_points = weight.q_per_channel_zero_points();
+  // weight.size(0) is int64_t, so the loop counter is int64_t as well rather
+  // than a narrower int.
+  const int64_t output_channels = weight.size(0);
+  for (int64_t i = 0; i < output_channels; ++i) {
+    if (zero_points[i].item<int32_t>() != 0) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// When qengine is x86, use this util func to check whether the onednn kernel
+// is preferred over fbgemm's for better performance.
+inline bool should_use_onednn_quant(
+    const at::Tensor& weight,
+    bool is_transposed_conv,
+    int groups,
+    torch::List<int64_t> output_padding) {
+  // Performance of onednn is only validated on Linux right now.
+  // Also, the heuristics for dispatching are based on perf data on Linux.
+  // So, for x86 qengine, we always use fbgemm kernels if OS is not Linux.
+  // TODO Support more OSs.
+#if !defined(__linux__)
+  return false;
+#else
+  bool vnni_available = cpuinfo_has_x86_avx512vnni();
+  bool w_sym_quant =
+      is_weight_symmetric_quant(weight, is_transposed_conv);
+  bool opad_all_zero = // the lambda takes int64_t so paddings are not narrowed to int
+      std::all_of(output_padding.begin(), output_padding.end(), [](int64_t i) { return i==0; });
+  return vnni_available && (groups <= 100) && w_sym_quant && opad_all_zero;
+#endif
+}
+
+} // onednn_utils
+
+at::Tensor _qconv_prepack_onednn( // declaration only; not defined in this header
+    at::Tensor weight, // from CPU backend instead of QuantizedCPU
+    at::Tensor weight_scales, // Weight zero points must be 0 for onednn
+    double input_scale,
+    int64_t input_zero_point,
+    torch::List<int64_t> stride,
+    torch::List<int64_t> padding,
+    torch::List<int64_t> dilation,
+    int64_t groups,
+    std::optional<torch::List<int64_t>> input_shape=std::nullopt);
+
+#endif // #if AT_MKLDNN_ENABLED()
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..b217c757740b30764c0b4c2b278d4fb0d1829fc6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h
@@ -0,0 +1,527 @@
+#pragma once
+
+#ifdef USE_PYTORCH_QNNPACK
+#include <ATen/core/Tensor.h>
+#include <c10/util/irange.h>
+#include <pytorch_qnnpack.h>
+#include <qnnpack_func.h>
+#include <ATen/native/quantized/cpu/XnnpackUtils.h>
+#include <ATen/native/quantized/PackedParams.h>
+#include <ATen/native/utils/Factory.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/empty.h>
+#endif
+
+#include <utility>
+inline int kPaddingChannels = 8; // extra trailing entries added to the weight zero-point/scale buffers
+struct QnnpackOperatorDeleter { // deleter functor for smart pointers holding a pytorch_qnnp_operator
+  void operator()(pytorch_qnnp_operator_t op) {
+    pytorch_qnnp_delete_operator(op); // release through the QNNPACK API
+  }
+};
+
+// PackedWeight struct for QNNPACK stores the original Weight and Bias as
+// QNNPACK currently does not support an unpack function.
+// For PyTorch Mobile, once the model is scripted and serialized we don't need
+// to call unpack, so we can save some memory by checking for this case and free
+// the original weights after packing.
+// Input scale is set to null in pre-pack step. QNNPACK needs bias quantized
+// with input scale which is available at runtime in pytorch. During runtime if
+// input scale value changes then we requantize bias with the updated scale. For
+// inference we expect the graph to be static so the input scale should not
+// change across consecutive inference calls.
+struct PackedLinearWeightsQnnp : public LinearPackedParamsBase {
+  PackedLinearWeightsQnnp( // stores the packed matrix, tensors and quantization metadata
+      std::unique_ptr<qnnpack::PackBMatrix> w,
+      at::Tensor orig_weight,
+      at::Tensor bias,
+      std::optional<double> input_scale,
+      at::Tensor w_scales,
+      std::vector<uint8_t>&& w_zps)
+      : w(std::move(w)),
+        orig_weight(std::move(orig_weight)),
+        bias_(at::native::mobile::allocate_padded_contiguous_if_needed(
+            bias, bias.suggest_memory_format())),
+        per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine), // this->: the parameter was moved from
+        input_scale(std::move(input_scale)),
+        w_scales(std::move(w_scales)),
+        w_zero_points(std::move(w_zps)),
+        q_scheme(this->orig_weight.qscheme()) {
+    weight_sizes = this->orig_weight.sizes().vec(); // own a copy of the weight shape
+  }
+
+  std::unique_ptr<qnnpack::PackBMatrix> w; // packed weight handed to the constructor
+  at::Tensor orig_weight; // original weight, kept as described in the struct comment
+  at::Tensor bias_; // result of allocate_padded_contiguous_if_needed on the bias argument
+  bool per_channel_; // orig_weight.qscheme() == at::kPerChannelAffine at construction
+  std::optional<double> input_scale;
+  at::Tensor w_scales;
+  std::vector<uint8_t> w_zero_points;
+  std::vector<float> requantization_scales; // left empty by the constructor
+  std::vector<int64_t> weight_sizes; // orig_weight.sizes() at construction
+  c10::QScheme q_scheme; // orig_weight.qscheme() at construction
+
+  at::Tensor apply( // declared here, defined out of line
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+  at::Tensor apply_relu( // declared here, defined out of line
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override;
+  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override;
+
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+
+  std::optional<at::Tensor> bias() override {
+    return bias_; // wrapped into the optional on return
+  }
+
+  static c10::intrusive_ptr<LinearPackedParamsBase> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias);
+
+  bool per_channel() const {
+    return per_channel_;
+  }
+
+ private:
+  std::mutex qnnp_mutex_; // NOTE(review): presumably guards QNNPACK calls in the out-of-line methods; confirm
+
+#ifdef USE_XNNPACK
+  xnnpack_operator xnnp_linear_op; // only present when built with USE_XNNPACK
+
+  template <typename scalar_t, bool kReluFused>
+  at::Tensor apply_impl_xnnp(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point);
+#endif // USE_XNNPACK
+
+  template <bool ReluFused>
+  at::Tensor apply_impl(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point);
+
+  template <bool ReluFused>
+  at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range);
+};
+
+// Packed QNNPACK parameters for a kSpatialDim-D quantized (transposed) convolution.
+// The constructor picks the micro-kernel type and allocates the operator and zero buffer.
+template <int kSpatialDim = 2>
+struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
+  PackedConvWeightsQnnp(
+      std::unique_ptr<qnnpack::PrePackConvWeights> w,
+      at::Tensor orig_weight,
+      at::Tensor bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      bool transpose,
+      std::optional<double> input_scale,
+      std::vector<int64_t> kernel,
+      at::Tensor w_scale,
+      std::vector<uint8_t>&& w_zps,
+      bool is_per_channel)
+      : w(std::move(w)),
+        orig_weight(std::move(orig_weight)),
+        bias(std::move(bias)),
+        stride_(std::move(stride)),
+        padding_(std::move(padding)),
+        output_padding_(std::move(output_padding)),
+        dilation_(std::move(dilation)),
+        groups_(groups),
+        transpose_(transpose),
+        is_per_channel_(is_per_channel),
+        input_scale(input_scale),
+        kernel_(std::move(kernel)),
+        w_scales(std::move(w_scale)),
+        w_zero_points(std::move(w_zps)) {
+    const bool any_padding = std::any_of(
+        padding_.begin(), padding_.end(), [](const auto& e) { return e != 0; });
+    const size_t kernel_size = std::accumulate( // size_t seed avoids accumulating in int
+        kernel_.begin(), kernel_.end(), size_t{1}, std::multiplies<>());
+
+    const size_t group_input_channels = transpose
+        ? this->orig_weight.size(0) / groups
+        : this->orig_weight.size(1);
+    const size_t group_output_channels = transpose
+        ? this->orig_weight.size(1)
+        : this->orig_weight.size(0) / groups;
+
+    const size_t kernel_depth = kSpatialDim == 3 ? kernel_[0] : 1;
+    const size_t kernel_height = kernel_[kSpatialDim - 2];
+    const size_t kernel_width = kernel_[kSpatialDim - 1];
+
+    pytorch_qnnp_ukernel_type ukernel_type;
+    if (transpose_) {
+      ukernel_type = pytorch_qnnp_ukernel_type_conv;
+    } else {
+      ukernel_type = pytorch_qnnp_ukernel_type_none;
+
+      const bool has_depthwise_dimensions =
+          (kSpatialDim == 2 &&
+           ((kernel_height == 3 && kernel_width == 3) ||
+            (kernel_height == 5 && kernel_width == 5))) ||
+          (kSpatialDim == 3 && kernel_height == 3 && kernel_width == 3 &&
+           kernel_depth == 3);
+      const bool has_depthwise_grouping =
+          group_input_channels == 1 && group_output_channels == 1 && groups > 1;
+
+      if (has_depthwise_dimensions && has_depthwise_grouping) {
+        ukernel_type = pytorch_qnnp_ukernel_type_dwconv;
+      } else if (
+          kernel_size == 1 &&
+          std::all_of(
+              stride_.begin(),
+              stride_.end(),
+              [](const auto& e) { return e == 1; }) &&
+          !any_padding) {
+        ukernel_type = group_input_channels >= SIZE_MAX // only true at SIZE_MAX itself
+            ? pytorch_qnnp_ukernel_type_xzp_gemm
+            : pytorch_qnnp_ukernel_type_gemm;
+      } else {
+        ukernel_type = pytorch_qnnp_ukernel_type_conv;
+      }
+    }
+
+    if (is_per_channel && ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) {
+      TORCH_INTERNAL_ASSERT(
+          false, "Per channel quantized weights are not supported for XZP kernels");
+    }
+
+    // Initially all the params are set to zero.
+    pytorch_qnnp_operator_t convolution = static_cast<pytorch_qnnp_operator_t>(
+        calloc(1, sizeof(struct pytorch_qnnp_operator)));
+    if (convolution == nullptr) {
+      TORCH_INTERNAL_ASSERT( // arguments are concatenated, not printf-formatted
+          false, "failed to allocate ", sizeof(struct pytorch_qnnp_operator),
+          " bytes for pytorch_qnnp_operator structure");
+    }
+
+    convolution_op =
+        std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>(
+            convolution);
+
+    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
+    convolution->ukernel_type = ukernel_type;
+    convolution->groups = groups;
+    convolution->group_input_channels = group_input_channels;
+    convolution->group_output_channels = group_output_channels;
+    convolution->kernel_depth = kernel_depth;
+    convolution->kernel_height = kernel_height;
+    convolution->kernel_width = kernel_width;
+    convolution->stride_depth = kSpatialDim == 3 ? stride_[0] : 1;
+    convolution->stride_height = stride_[kSpatialDim - 2];
+    convolution->stride_width = stride_[kSpatialDim - 1];
+    convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1;
+    convolution->dilation_height = dilation_[kSpatialDim - 2];
+    convolution->dilation_width = dilation_[kSpatialDim - 1];
+    convolution->input_padding_height = padding_[kSpatialDim - 2];
+    convolution->input_padding_width = padding_[kSpatialDim - 1];
+    convolution->input_padding_depth = kSpatialDim == 3 ? padding_[0] : 0;
+    convolution->per_channel = is_per_channel_;
+    convolution->transpose = transpose_;
+
+    const uint32_t kr = pytorch_qnnp_params.q8conv.kr;
+    const size_t k_stride = (group_input_channels + (kr - 1)) & -kr;
+
+    size_t zero_size = sizeof(uint8_t) * k_stride;
+    size_t zero_offset = 0;
+
+    if (transpose_) {
+      convolution->adjustment_width = output_padding_[1]; // NOTE(review): indices 0/1 look 2-D specific; confirm for kSpatialDim == 3
+      convolution->adjustment_height = output_padding_[0];
+      if (group_input_channels < 8) {
+        zero_size += 8;
+        zero_offset = 8;
+      }
+    } else {
+      if (any_padding) {
+        zero_size = 0;
+        zero_offset = 0;
+        if (ukernel_type == pytorch_qnnp_ukernel_type_dwconv) {
+          const uint32_t cr = pytorch_qnnp_params.q8dw9.cr;
+          const size_t group_stride = (groups + (cr - 1)) & -cr;
+          if (groups >= 8) {
+            zero_size = sizeof(uint8_t) * group_stride;
+            zero_offset = 0;
+          } else {
+            zero_size = sizeof(uint8_t) * group_stride + 8;
+            zero_offset = sizeof(uint8_t) * 8;
+          }
+        } else if (
+            ukernel_type == pytorch_qnnp_ukernel_type_conv ||
+            ukernel_type == pytorch_qnnp_ukernel_type_gemm) {
+          if (group_input_channels >= 8) {
+            zero_size = sizeof(uint8_t) * k_stride;
+            zero_offset = 0;
+          } else {
+            zero_size = sizeof(uint8_t) * k_stride + 8;
+            zero_offset = 8;
+          }
+        }
+      }
+    }
+
+    // NOLINTNEXTLINE(clang-analyzer-optin.portability.UnixAPI)
+    void* zero_buffer = malloc(zero_size);
+    if (zero_buffer == nullptr) {
+      // convolution_op owns `convolution` and frees it while unwinding; deleting
+      // it here as well would release the operator twice.
+      TORCH_INTERNAL_ASSERT(
+          false, "failed to allocate ", zero_size, " bytes for zero padding");
+    }
+    // Need to set to input zero point
+    // memset(zero_buffer, input_zero_point, zero_size);
+    zero_buffer_size = zero_size;
+    convolution->zero_buffer = zero_buffer;
+    convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);
+  }
+
+  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter> convolution_op; // owns the operator allocated in the constructor
+  #ifdef USE_XNNPACK
+  xnnpack_operator xnnp_convolution_op;
+  #endif  // USE_XNNPACK
+  std::unique_ptr<qnnpack::PrePackConvWeights> w;
+  at::Tensor orig_weight;
+  at::Tensor bias;
+  torch::List<int64_t> stride_;
+  torch::List<int64_t> padding_;
+  torch::List<int64_t> output_padding_;
+  torch::List<int64_t> dilation_;
+  int64_t groups_;
+  bool transpose_;
+  bool is_per_channel_;
+  std::optional<double> input_scale;
+  std::vector<int64_t> kernel_;
+  at::Tensor w_scales;
+  std::vector<uint8_t> w_zero_points;
+  std::vector<float> requantization_scales; // left empty by the constructor
+  size_t zero_buffer_size; // byte size of the zero buffer allocated in the constructor
+
+  at::Tensor apply(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_dynamic(
+      const at::Tensor& input,
+      bool reduce_range=false) override;
+
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+
+  static c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      bool transpose);
+
+  torch::List<int64_t> stride() const override {
+    return stride_;
+  }
+
+  torch::List<int64_t> padding() const override {
+    return padding_;
+  }
+
+  torch::List<int64_t> output_padding() const override {
+    return output_padding_;
+  }
+
+  torch::List<int64_t> dilation() const override {
+    return dilation_;
+  }
+
+  int64_t groups() const override {
+    return groups_;
+  }
+
+  bool transpose() const override {
+    return transpose_;
+  }
+
+  bool per_channel() const {
+    return is_per_channel_;
+  }
+
+ private:
+  std::mutex qnnp_mutex_;
+  template <bool ReluFused>
+  at::Tensor apply_impl(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point);
+
+#ifdef USE_XNNPACK
+  template <typename scalar_t, bool ReluFused>
+  at::Tensor apply_impl_xnnp(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point);
+#endif // USE_XNNPACK
+};
+
+enum class Activation : uint8_t { NONE = 0, RELU = 1 }; // fused-activation selector used by activationLimits()
+
+#if defined(__ANDROID__) && !defined(__NDK_MAJOR__) // Android builds where __NDK_MAJOR__ is undefined
+template <class T>
+inline float Round(const float x) { // T cannot be deduced from the argument; callers must name it
+  return ::nearbyintf(x); // global-namespace C function instead of std::nearbyint
+}
+inline double Round(const double x) { // non-template overload picked for plain Round(x) calls
+  return ::nearbyint(x);
+}
+#else
+template <class T>
+inline T Round(const T x) { // generic version for every other toolchain
+  return std::nearbyint(x); // rounds to an integral value using the current rounding mode
+}
+#endif
+
+// Quantizes `value` as zero_point + Round(value / scale), saturated to T's range.
+template<typename T>
+inline T QuantizeValue(float scale, int32_t zero_point, float value) {
+  const int32_t lowest = std::numeric_limits<T>::min();
+  const int32_t highest = std::numeric_limits<T>::max();
+  const int32_t rounded = static_cast<int32_t>(Round(value / scale));
+  const int32_t shifted = zero_point + rounded;
+  return static_cast<T>(std::min(std::max(shifted, lowest), highest));
+}
+
+// Output clamp {min, max} for a fused activation: T's full range for NONE, and
+// [quantized 0, T max] for RELU. Any other value is marked unreachable.
+template<typename T>
+inline std::pair<T, T> activationLimits(
+    float scale,
+    int32_t zero_point,
+    Activation Ac) {
+  const T upper = std::numeric_limits<T>::max();
+  if (Ac == Activation::NONE) {
+    return {std::numeric_limits<T>::min(), upper};
+  }
+  if (Ac == Activation::RELU) {
+    return {QuantizeValue<T>(scale, zero_point, 0.0), upper};
+  }
+#ifdef _MSC_VER
+  __assume(0);
+#else
+  __builtin_unreachable();
+#endif
+}
+
+namespace at {
+namespace native {
+namespace qnnp_avgpool_helper {
+Tensor qnnpack_avg_pool2d( // declaration only; no body is given here
+    Tensor input,
+    IntArrayRef kernel_size,
+    IntArrayRef stride,
+    IntArrayRef padding,
+    bool ceil_mode,
+    bool count_include_pad,
+    std::optional<int64_t> divisor_override); // optional: callers may pass std::nullopt
+} // qnnp_avgpool_helper
+} // namespace native
+} // namespace at
+
+namespace {
+// Sets requant_scales[i] = weight_scales[i] * input_scale / output_scale for all
+// numel() (padded) entries, growing the vector if needed; returns a copy of it.
+C10_UNUSED std::vector<float> generate_requantization_scales(
+    const at::Tensor& weight_scales,
+    const float input_scale,
+    const float output_scale,
+    std::vector<float>& requant_scales) {
+  const int64_t padded_count = weight_scales.numel();
+  const float* const scales_in = weight_scales.data_ptr<float>();
+  if (static_cast<int64_t>(requant_scales.size()) < padded_count) {
+    requant_scales.resize(padded_count);
+  }
+  const float inverse_output_scale = 1.f / output_scale;
+  for (const auto ch : c10::irange(padded_count)) {
+    float& out = requant_scales[ch];
+    out = (scales_in[ch] * input_scale) * inverse_output_scale;
+    TORCH_CHECK(
+        (out > 0.0f && std::isnormal(out)),
+        "failed to create op with requantization scale: ", out,
+        ": requantization scale must be finite and positive");
+  }
+  return requant_scales;
+}
+
+// Builds the QNNPACK weight zero-point vector and float scale tensor for
+// weight_contig. Both get kPaddingChannels extra trailing entries (zero point
+// 0, scale 1.f). Zero points are shifted by +128 and stored as uint8_t.
+C10_UNUSED std::pair<std::vector<uint8_t>, at::Tensor> make_zero_points_and_scales_tensor(
+    const at::Tensor& weight_contig,
+    bool transpose = false,
+    uint32_t groups = 1
+  ) {
+  const int out_ch_idx = transpose ? 1 : 0;
+  const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1);
+  // Add 8 to account for buffering needed by QNNPACK.
+  const auto num_output_channels_padded = num_output_channels + kPaddingChannels;
+  const auto qtype = weight_contig.qscheme();
+  const bool per_tensor = qtype == at::kPerTensorAffine;
+  const bool per_channel = qtype == at::kPerChannelAffine;
+  // Adjust weight zero point, similar to weight data.
+  std::vector<uint8_t> weight_zp(num_output_channels_padded, 0);
+  if (per_tensor) {
+    const auto shifted_zp = static_cast<uint8_t>(weight_contig.q_zero_point() + 128);
+    std::fill_n(weight_zp.begin(), num_output_channels, shifted_zp);
+  } else if (per_channel) {
+    TORCH_CHECK(
+        weight_contig.q_per_channel_zero_points().scalar_type() == at::kLong,
+        "Per channel zero points dtype must be long int.");
+    const int64_t* const channel_zps =
+      weight_contig.q_per_channel_zero_points().data_ptr<int64_t>();
+    for (const auto ch : c10::irange(num_output_channels)) {
+      weight_zp[ch] = static_cast<uint8_t>(channel_zps[ch] + 128);
+    }
+  } else {
+    TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
+  }
+  at::Tensor weight_scales = at::empty(
+      {num_output_channels_padded}, at::device(at::kCPU).dtype(at::kFloat));
+  float* const scales_out = weight_scales.data_ptr<float>();
+  if (per_tensor) {
+    const float tensor_scale = static_cast<float>(weight_contig.q_scale());
+    std::fill_n(scales_out, num_output_channels, tensor_scale);
+  } else if (per_channel) {
+    TORCH_CHECK(
+        weight_contig.q_per_channel_scales().scalar_type() == at::kDouble,
+        "Per channel scales dtype must be double.");
+    const double* const channel_scales =
+      weight_contig.q_per_channel_scales().data_ptr<double>();
+    for (const auto ch : c10::irange(num_output_channels)) {
+      scales_out[ch] = static_cast<float>(channel_scales[ch]);
+    }
+  } else {
+    TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
+  }
+  // The padded tail entries get a scale of 1.
+  std::fill_n(scales_out + num_output_channels, num_output_channels_padded - num_output_channels, 1.f);
+  return {weight_zp, weight_scales};
+}
+} // namespace
+
+#endif
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b026c739786a0b68ccf779f2724c1c4607998e1
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h
@@ -0,0 +1,239 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/core/List.h>
+#include <ATen/TensorOperators.h>
+#include <c10/util/irange.h>
+#include <algorithm>
+#include <cmath>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#include <ATen/NativeFunctions.h>
+#else
+#include <ATen/ops/quantize_per_tensor_native.h>
+#include <ATen/ops/quantize_per_channel_native.h>
+#include <ATen/ops/zeros.h>
+#endif
+
+namespace quant_utils {
+namespace {
+  // Converts a raw IEEE 754 binary16 bit pattern to float, including zero,
+  // subnormals (exponent 0) and Inf/NaN (exponent 0x1f).
+  float RawUint16ToFp16(unsigned short value) {
+    const unsigned short exponent_bits = value >> 10 & 0x1f;
+    const unsigned short significand_bits = value & 0x3ff;
+    const float sign = (value >> 15) ? -1.f : 1.f;
+    const float fraction = significand_bits * 0.0009765625f; // 0.0009765625f = 2^-10
+    if (exponent_bits == 0x1f) { // all-ones exponent: infinity or NaN
+      return significand_bits ? NAN : sign * HUGE_VALF;
+    }
+    // Exponent 0: no implicit leading 1 and the scale is fixed at 2^-14.
+    return exponent_bits == 0 ? sign * std::ldexp(fraction, -14)
+                              : sign * std::ldexp(1 + fraction, exponent_bits - 0xf);
+}
+
+// Sets *element to max_val if it exceeds it, otherwise to -max_val if it is
+// below that; returns true iff *element was overwritten.
+template <typename T>
+bool CheckAndSaturate(T max_val, T* element) {
+  T& current = *element;
+  const bool above = current > max_val;
+  const bool below = !above && current < -max_val;
+  if (above || below) {
+    current = above ? max_val : -max_val;
+  }
+  return above || below;
+}
+}
+using namespace std;
+// A structure to hold quantization parameters 'scale' and 'zero_point'.
+// The meaning of these values is as the constants in the quantization equation
+//
+//   real_value = scale * (quantized_value - zero_point)
+//
+// In other words, 'zero_point' is the quantized value that corresponds
+// to the real value 0, and 'scale' is the difference of real values
+// corresponding to consecutive quantized values.
+struct TensorQuantizationParams {
+  double scale; // real-value step between consecutive quantized values
+  std::int32_t zero_point; // quantized value that represents the real value 0
+  int precision; // never assigned explicitly by ChooseQuantizationParams below
+};
+
+// Use fp16_min as the small scale cutoff because we don't want to use scales in
+// fp16 subnormal range. This is to be consistent with Glow and FakeLowP
+// implementation for NNPI.
+constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f;
+
+// Following implementation should be identical to fbgemm::ChooseQuantizationParams
+// Picks scale/zero_point so that [min, max] (widened to contain 0) maps onto
+// [qmin, qmax]. reduce_range halves qmin/qmax first; preserve_sparsity makes a
+// sign-crossing range symmetric. The result's `precision` is only zero-initialized.
+inline TensorQuantizationParams ChooseQuantizationParams(
+    float min,
+    float max,
+    int32_t qmin,
+    int32_t qmax,
+    bool preserve_sparsity = false,
+    bool force_scale_power_of_two = false,
+    bool reduce_range = false) {
+  TORCH_CHECK(
+      min <= max,
+      "In ChooseQuantizationParams, min should be less than or equal to max");
+
+  if (reduce_range) {
+    qmin = qmin/2;
+    qmax = qmax/2;
+  }
+  if (min < 0 && max > 0 && preserve_sparsity) {
+    int symmetric_qmin = -((qmax - qmin) / 2 + 1);
+    int symmetric_qmax = (qmax - qmin) / 2;
+    double max_scale =
+        std::max(fabs(min / symmetric_qmin), fabs(max / symmetric_qmax));
+    min = max_scale * symmetric_qmin;
+    max = max_scale * symmetric_qmax;
+  }
+
+  // We extend the [min, max] interval to ensure that it contains 0.
+  // Otherwise, we would not meet the requirement that 0 be an exactly
+  // representable value.
+  min = std::min(min, 0.f);
+  max = std::max(max, 0.f);
+
+  TORCH_CHECK(
+      qmin < qmax,
+      "In ChooseQuantizationParams, qmin should be less than qmax");
+
+  // Use double precision for intermediate computation but use single precision
+  // in final number to reflect the actual number used during quantization.
+  double scale = (static_cast<double>(max) - min) / (qmax - qmin);
+  // If scale is 0 or too small so its reciprocal is infinity, we arbitrary
+  // adjust the scale to 0.1 . We want to avoid scale's reciprocal being
+  // infinity because some of fbgemm code pre-computes scale's reciprocal to do
+  // multiplication instead of division in the time critical part of code.
+  if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) {
+    scale = 0.1;
+  }
+  TORCH_CHECK(scale > 0, "quantization scale should be > 0");
+
+  if (force_scale_power_of_two) {
+    // std::ldexp instead of `1 << n`: the int shift is undefined once n >= 31.
+    const int exponent = scale < 1 ? -static_cast<int>(floor(log(1.0 / scale) / log(2)))
+                                   : static_cast<int>(ceil(log(scale) / log(2)));
+    scale = std::ldexp(1.0, exponent);
+  }
+
+  // Cut off small scale
+  if (scale < SMALL_SCALE_THRESHOLD) {
+    float org_scale = scale;
+    scale = SMALL_SCALE_THRESHOLD;
+    // Adjust the min and max based on the new scale
+    if (min == 0.0f) {
+      max = SMALL_SCALE_THRESHOLD * (qmax - qmin);
+    } else if (max == 0.0f) {
+      min = -SMALL_SCALE_THRESHOLD * (qmax - qmin);
+    } else {
+      float amplifier = SMALL_SCALE_THRESHOLD / org_scale;
+      min *= amplifier;
+      max *= amplifier;
+    }
+  }
+
+  // Zero-point computation.
+  // First the initial floating-point computation. The zero-point can be
+  // determined from solving an affine equation for any known pair
+  // (real value, corresponding quantized value).
+  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+  // The arithmetic error on the zero point computed from either pair
+  // will be roughly machine_epsilon * (sum of absolute values of terms)
+  // so we want to use the variant that adds the smaller terms.
+  // NOTE(review): the text above says "sum", yet the error terms below subtract; confirm.
+  double zero_point_from_min = qmin - min / static_cast<double>(scale);
+  double zero_point_from_max = qmax - max / static_cast<double>(scale);
+  double zero_point_from_min_error =
+      std::abs(qmin) - std::abs(min / static_cast<double>(scale));
+  double zero_point_from_max_error =
+      std::abs(qmax) - std::abs(max / static_cast<double>(scale));
+  double initial_zero_point =
+      zero_point_from_min_error < zero_point_from_max_error ? zero_point_from_min
+                                                            : zero_point_from_max;
+
+  // for symmetric quantization (preserve_sparsity == true), we force zero_point to be a middle
+  // value between qmin and qmax. If either min or max is 0, then we just use 0 as zero_point.
+  if (min < 0 && max > 0 && preserve_sparsity) {
+    initial_zero_point = static_cast<double>(qmin + qmax) / 2;
+  }
+
+  // Now we need to nudge the zero point to be an integer (our zero points are
+  // integer, and this is motivated by the requirement to be able to represent
+  // the real value "0" exactly as a quantized value, which is required in
+  // multiple places, for example in Im2col with zero padding).
+  int32_t nudged_zero_point = 0;
+  if (initial_zero_point < qmin) {
+    nudged_zero_point = qmin;
+  } else if (initial_zero_point > qmax) {
+    nudged_zero_point = qmax;
+  } else {
+    nudged_zero_point = nearbyint(initial_zero_point);
+  }
+
+  TensorQuantizationParams result{}; // value-initialized so `precision` is not indeterminate
+  result.scale = scale;
+  result.zero_point = nudged_zero_point;
+  return result;
+}
+
+// Expands a Conv1d argument list into the two-element form used by the Conv2d
+// op: slot kConv1dSqueezeDim holds base_value, slot 1 holds the 1d setting
+// (arg[0] for a single-element list, arg[1] otherwise).
+constexpr int64_t kConv1dSqueezeDim = 0;
+static C10_UNUSED torch::List<int64_t> MakeArgForConv1d(const torch::List<int64_t>& arg,
+                                             int64_t base_value) {
+  TORCH_CHECK(!arg.empty(), "Argument must have elements.");
+  const bool single = arg.size() == 1;
+  const int64_t second = single ? arg.get(0) : arg.get(1);
+  torch::List<int64_t> result({arg.get(0), second});
+  // The squeezed dimension always carries base_value.
+  result[kConv1dSqueezeDim] = base_value;
+  return result;
+}
+
+// Saturates each of the N floats in `weight` to +/- the value decoded from the
+// fp16 bit pattern 0x7BFF, and warns once if at least one element had to be
+// changed.
+inline void HandleWeightsSaturation(int64_t N, float* weight) {
+  const float fp16_max = RawUint16ToFp16(0x7BFF);
+  bool any_saturated = false;
+  for (const auto idx : c10::irange(N)) {
+    // Plain |= (no short-circuit): every element must still be visited.
+    any_saturated |= CheckAndSaturate<float>(fp16_max, &weight[idx]);
+  }
+  if (!any_saturated) {
+    return;
+  }
+  // One warning per call, not per element.
+  TORCH_WARN("FOUND weight out of range ");
+}
+
+// Quantizes `bias` to kQInt32 with zero point 0 and scale
+// weight_scale * input_scale; per-channel weights use one scale per channel
+// (axis 0), otherwise the single per-tensor weight scale is used.
+inline at::Tensor QuantizeBias(
+    bool is_per_channel,
+    const at::Tensor& bias,
+    const at::Tensor& weight_contig,
+    double input_scale) {
+  if (!is_per_channel) {
+    const double tensor_scale = weight_contig.q_scale() * input_scale;
+    return at::native::quantize_per_tensor(bias, tensor_scale, 0, c10::kQInt32);
+  }
+  // Per-channel path: every weight-channel scale is multiplied by the input scale.
+  const at::Tensor channel_scales =
+      weight_contig.q_per_channel_scales() * input_scale;
+  const at::Tensor zero_points = at::zeros(channel_scales.sizes(), c10::kInt);
+  return at::native::quantize_per_channel(
+      bias, channel_scales, zero_points, 0, c10::kQInt32);
+}
+
+} // namespace quant_utils
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h
new file mode 100644
index 0000000000000000000000000000000000000000..9257f57b65dcd9355f0c6a7c35cecb1805e13baa
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h
@@ -0,0 +1,258 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <ATen/core/IListRef.h>
+#include <ATen/Dispatch.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/native/Activation.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at {
+namespace native {
+
+using qrelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/);
+using qrelu_leaky_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/,
+                                const Scalar& /*negval_*/);
+using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, GeluType /* approximate */);
+using qsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, double output_scale, int64_t output_zero_point);
+using qhardsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/);
+using qclamp_fn = void (*)(
+    const at::Tensor& /*qx*/,
+    const Scalar& min,
+    const Scalar& max,
+    at::Tensor& /*qy*/);
+using qclamp_minmax_fn = void (*)(
+    const at::Tensor& /*qx*/,
+    const Scalar& /*min or max*/,
+    at::Tensor& /*qy*/);
+using qthreshold_fn = void (*)(
+    const at::Tensor& /*qx*/,
+    const Scalar& threshold,
+    const Scalar& value,
+    at::Tensor& /*qy*/);
+using qtanh_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/);
+using qelu_fn = void(*)(
+    const at::Tensor& /*qx*/,
+    const Scalar& /*alpha*/,
+    const Scalar& /*scale*/,
+    const Scalar& /*input_scale*/,
+    at::Tensor& /*qy*/);
+using qbinary_fn =
+    void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Tensor& /*other*/);
+using qadd_scalar_fn =
+    void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Scalar& other /*other*/);
+using qhardswish_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/);
+using qdropout_fn = void(*)(
+    const at::Tensor& /*qx*/,
+    const Scalar& /*p*/,
+    bool training /*training*/,
+    at::Tensor& /*qy*/);
+using qmaxpool_2d_fn = void (*)(
+    const Tensor& qx,
+    int64_t iC, // input/output channels
+    int64_t iH,
+    int64_t iW, // input sizes
+    int64_t oH,
+    int64_t oW, // output sizes
+    int64_t kH,
+    int64_t kW, // kernel size
+    int64_t sH,
+    int64_t sW, // strides
+    int64_t pH,
+    int64_t pW, // padding
+    int64_t dH,
+    int64_t dW, // dilation
+    Tensor& qy);
+using qmaxpool_3d_fn = void (*)(
+    const Tensor& qx,
+    int64_t iC, // input/output channels
+    int64_t iT,
+    int64_t iH,
+    int64_t iW, // input sizes
+    int64_t oT,
+    int64_t oH,
+    int64_t oW, // output sizes
+    int64_t kT,
+    int64_t kH,
+    int64_t kW, // kernel size
+    int64_t sT,
+    int64_t sH,
+    int64_t sW, // strides
+    int64_t pT,
+    int64_t pH,
+    int64_t pW, // padding
+    int64_t dT,
+    int64_t dH,
+    int64_t dW, // dilation
+    Tensor& qy);
+using qadaptive_avg_pool2d_fn = void (*)(
+    const Tensor& qx,
+    Tensor& qy,
+    int64_t sizeB,
+    int64_t sizeC,
+    int64_t isizeH,
+    int64_t isizeW,
+    int64_t osizeH,
+    int64_t osizeW,
+    int64_t istrideB,
+    int64_t istrideC,
+    int64_t istrideH,
+    int64_t istrideW);
+using qadaptive_avg_pool3d_fn = void (*)(
+    const Tensor& qx,
+    Tensor& qy,
+    int64_t sizeB,
+    int64_t sizeC,
+    int64_t isizeD,
+    int64_t isizeH,
+    int64_t isizeW,
+    int64_t osizeD,
+    int64_t osizeH,
+    int64_t osizeW,
+    int64_t istrideB,
+    int64_t istrideC,
+    int64_t istrideD,
+    int64_t istrideH,
+    int64_t istrideW);
+using qavg_pool2d_fn = void (*)(
+    const Tensor& qx,
+    Tensor& qy,
+    int64_t nBatch,
+    int64_t nInputPlane,
+    int64_t inputWidth,
+    int64_t inputHeight,
+    int64_t outputWidth,
+    int64_t outputHeight,
+    int kW,
+    int kH,
+    int dW,
+    int dH,
+    int padW,
+    int padH,
+    bool count_include_pad,
+    std::optional<int64_t> divisor_override);
+
+using qavg_pool3d_fn = void (*)(
+    const Tensor& qx,
+    Tensor& qy,
+    int64_t nBatch,
+    int64_t nInputPlane,
+    int64_t inputWidth,
+    int64_t inputHeight,
+    int64_t inputDepth,
+    int64_t outputWidth,
+    int64_t outputHeight,
+    int64_t outputDepth,
+    int kW,
+    int kH,
+    int kD,
+    int dW,
+    int dH,
+    int dD,
+    int padW,
+    int padH,
+    int padD,
+    bool count_include_pad,
+    std::optional<int64_t> divisor_override);
+
+using qupsample_bilinear2d_fn = void (*)( // kernel type declared below for qupsample_bilinear2d_nhwc_stub
+    Tensor& output, // note: the mutable output comes first in this signature
+    const Tensor& input,
+    int64_t input_height,
+    int64_t input_width,
+    int64_t output_height,
+    int64_t output_width,
+    int64_t nbatch,
+    int64_t channels,
+    bool align_corners,
+    std::optional<double> scales_h, // optional; may be empty
+    std::optional<double> scales_w); // optional; may be empty
+
+using qcat_nhwc_fn = Tensor (*)( // kernel type shared by qcat_nhwc_stub and qcat_relu_nhwc_stub; returns a Tensor by value
+    const MaterializedITensorListRef& qxs, // list of tensors to concatenate (per the "qcat" name)
+    int64_t dim,
+    double scale, // presumably the output quantization scale -- confirm in the kernel
+    int64_t zero_point); // presumably the output quantization zero point -- confirm in the kernel
+using qtopk_fn = void(*)(Tensor&, Tensor&, const Tensor&, int64_t, int64_t, bool, bool); // kernel type declared below for qtopk_stub; parameters are unnamed here
+
+using qbatch_norm_fn = void(*)(int64_t, int64_t, int64_t, int64_t, int64_t, const Tensor&, const Tensor&, const Tensor&, Tensor&); // kernel type shared by qbatch_norm_stub and qbatch_norm_relu_stub; parameters are unnamed here
+
+using qnormalize_fn = void (*)( // kernel type declared below for quantized_normalize_stub
+    const Tensor& /* X */,
+    const Tensor& /* gamma */,
+    const Tensor& /* beta */,
+    bool /* affine_per_channel */,
+    int /* num_channels */,
+    int /* num_groups */,
+    int64_t /* M */,
+    int64_t /* N */,
+    double /* eps */,
+    Tensor* /* Y */); // Y is passed by pointer, unlike the reference outputs used by other kernel types here
+
+using qmean_inner_dim_fn = void (*)( // kernel type declared below for qmean_inner_dim_stub
+    const Tensor& /* X */,
+    OptionalIntArrayRef /* opt_dim */,
+    bool /* keepdim */,
+    std::optional<ScalarType> /* opt_dtype */,
+    Tensor& /* Y */); // mutable tensor argument (presumably receives the result)
+
+using qstd_inner_dim_fn = void (*)( // kernel type declared below for qstd_inner_dim_stub
+    const Tensor& /* X */,
+    OptionalIntArrayRef /* dim */,
+    const std::optional<Scalar>& /* correction */,
+    bool /* keepdim */,
+    Tensor& /* Y */); // mutable tensor argument (presumably receives the result)
+
+using qnormalize_nhwc_fn = void (*)( // kernel type declared below for quantized_groupnorm_nhwc_stub; same parameter list as qnormalize_fn
+    const Tensor& /* X */,
+    const Tensor& /* gamma */,
+    const Tensor& /* beta */,
+    bool /* affine_per_channel */,
+    int /* num_channels */,
+    int /* num_groups */,
+    int64_t /* M */,
+    int64_t /* N */,
+    double /* eps */,
+    Tensor* /* Y */); // Y is passed by pointer
+
+using qprelu_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, // kernel type declared below for qprelu_stub
+                           const Tensor& /*qw*/);
+
+DECLARE_DISPATCH(qadaptive_avg_pool2d_fn, qadaptive_avg_pool2d_nhwc_stub); // each line pairs a kernel function-pointer type (1st arg) with a dispatch stub name (2nd arg)
+DECLARE_DISPATCH(qadaptive_avg_pool3d_fn, qadaptive_avg_pool3d_ndhwc_stub);
+DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_relu_stub); // qadd_scalar_fn is not defined in this part of the header (presumably earlier)
+DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_stub);
+DECLARE_DISPATCH(qavg_pool2d_fn, qavg_pool2d_nhwc_stub);
+DECLARE_DISPATCH(qavg_pool3d_fn, qavg_pool3d_nhwc_stub);
+DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_relu_stub);
+DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_stub);
+DECLARE_DISPATCH(qbinary_fn, qadd_relu_stub); // one type, four stubs: add / mul, each with and without relu
+DECLARE_DISPATCH(qbinary_fn, qadd_stub);
+DECLARE_DISPATCH(qbinary_fn, qmul_relu_stub);
+DECLARE_DISPATCH(qbinary_fn, qmul_stub);
+DECLARE_DISPATCH(qcat_nhwc_fn, qcat_nhwc_stub);
+DECLARE_DISPATCH(qcat_nhwc_fn, qcat_relu_nhwc_stub);
+DECLARE_DISPATCH(qclamp_fn, qclamp_stub);
+DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_min_stub); // min and max share one function-pointer type
+DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_max_stub);
+DECLARE_DISPATCH(qelu_fn, qelu_stub);
+DECLARE_DISPATCH(qhardsigmoid_fn, qhardsigmoid_stub);
+DECLARE_DISPATCH(qhardswish_fn, qhardswish_stub);
+DECLARE_DISPATCH(qdropout_fn, qdropout_stub);
+DECLARE_DISPATCH(qmaxpool_2d_fn, qmaxpool_2d_nhwc_stub);
+DECLARE_DISPATCH(qmaxpool_3d_fn, qmaxpool_3d_nthwc_stub);
+DECLARE_DISPATCH(qnormalize_fn, quantized_normalize_stub);
+DECLARE_DISPATCH(qnormalize_nhwc_fn, quantized_groupnorm_nhwc_stub);
+DECLARE_DISPATCH(qrelu_fn, qrelu_stub);
+DECLARE_DISPATCH(qrelu_leaky_fn, qrelu_leaky_stub);
+DECLARE_DISPATCH(qgelu_fn, qgelu_stub);
+DECLARE_DISPATCH(qsigmoid_fn, qsigmoid_stub);
+DECLARE_DISPATCH(qtanh_fn, qtanh_stub);
+DECLARE_DISPATCH(qthreshold_fn, qthreshold_stub);
+DECLARE_DISPATCH(qtopk_fn, qtopk_stub);
+DECLARE_DISPATCH(qupsample_bilinear2d_fn, qupsample_bilinear2d_nhwc_stub);
+DECLARE_DISPATCH(qmean_inner_dim_fn, qmean_inner_dim_stub);
+DECLARE_DISPATCH(qstd_inner_dim_fn, qstd_inner_dim_stub);
+DECLARE_DISPATCH(qprelu_fn, qprelu_stub);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..72abe1ad817f484e0d269b31cf78b98bf0694e5a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#ifdef USE_RUY_QMATMUL
+
+#include <ruy/ruy.h>
+
+namespace at {
+namespace native {
+namespace ruy_utils {
+
+ruy::Context* get_ruy_context(); // returns a ruy::Context pointer; ownership and lifetime are not visible in this header -- check the definition
+
+void quantize_multiplier(double scale, // presumably splits `scale` into a fixed-point multiplier and an exponent -- confirm in the definition
+                         int* multiplier_fixedpoint, // pointer parameter, presumably written by the call
+                         int* multiplier_exponent); // pointer parameter, presumably written by the call
+
+} // namespace ruy_utils
+} // namespace native
+} // namespace at
+
+#endif // USE_RUY_QMATMUL
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff334d4c8d48ceeb4fa83fdbcd2e678a3e2d887d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h
@@ -0,0 +1,335 @@
+#pragma once
+
+#ifdef USE_XNNPACK
+#include <cstdint>
+
+#include <ATen/core/Tensor.h>
+#include <ATen/native/xnnpack/Common.h>
+
+using xnnpack_operator = at::native::xnnpack::Operator; // short alias; note it is declared at global scope, before `namespace at` opens
+
+namespace at {
+namespace native {
+namespace xnnp_utils {
+
+/*
+ * Returns the shape of `in` with dimensions ordered as in its memory format,
+ * e.g. a channels_last tensor yields NHWC instead of NCHW.
+ */
+std::vector<size_t> get_mem_format_aware_shape(const at::Tensor& in);
+
+/*
+ * Copies int8_t weights from `in` to `out` while adding an offset.
+ * Input is always int8_t; output is int8_t or uint8_t (presumably chosen by PT):
+ *   int8_t + 128 = uint8_t
+ *   int8_t + 0   = int8_t
+ */
+template <typename PT>
+void q8_copy_int8_weight_and_add_offset(const at::Tensor& in, at::Tensor& out);
+
+template <int kSpatialDim> // number of spatial dimensions of the convolution
+Tensor convert_conv_weights_to_channel_last_tensor( // declaration only; returns a new Tensor by value
+    const at::Tensor& src, // source weights, not modified through this reference
+    int groups,
+    bool transpose); // presumably true for transposed (de)convolution weights -- confirm in the definition
+
+/* Creates an XNNPACK QS8 2-D NHWC operator in `*op`: a deconvolution when `transpose`
+ * is set (per-tensor scale only), otherwise a per-tensor (qs8) or per-channel (qs8_qc8w)
+ * convolution. `kzp` must be 0. Returns the status of the underlying xnn_create call. */
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_create_convolution2d_nhwc(
+    uint32_t pad_top,
+    uint32_t pad_right,
+    uint32_t pad_bottom,
+    uint32_t pad_left,
+    uint32_t kernel_h,
+    uint32_t kernel_w,
+    uint32_t stride_h,
+    uint32_t stride_w,
+    uint32_t dilation_h,
+    uint32_t dilation_w,
+    uint32_t groups,
+    size_t group_input_channels,
+    size_t group_output_channels,
+    size_t ip_chan_stride,
+    size_t op_chan_stride,
+    int8_t izp,
+    float ip_scale,
+    int8_t kzp,
+    const float* k_scales,
+    const int8_t* kernel,
+    const int32_t* bias,
+    int8_t ozp,
+    float op_scale,
+    int8_t op_min,
+    int8_t op_max,
+    uint32_t flags,
+    xnn_operator_t* op,
+    bool per_channel,
+    bool transpose) {
+  /* Symmetric quantization forces kzp = 0. The cast makes the int8_t print as a number, not a char. */
+  TORCH_CHECK(!kzp, "XNNPACK Q[SC]8 conv kernels expects kernel zero point to be zero. "
+                    "But got: ", static_cast<int>(kzp));
+
+  if (transpose) {
+    TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
+    return xnn_create_deconvolution2d_nhwc_qs8(
+        pad_top,        /* uint32_t output_padding_top          */
+        pad_right,      /* uint32_t output_padding_right        */
+        pad_bottom,     /* uint32_t output_padding_bottom       */
+        pad_left,       /* uint32_t output_padding_left         */
+        kernel_h,       /* uint32_t kernel_height               */
+        kernel_w,       /* uint32_t kernel_width                */
+        stride_h,       /* uint32_t stride_height               */
+        stride_w,       /* uint32_t stride_width                */
+        dilation_h,     /* uint32_t dilation_height             */
+        dilation_w,     /* uint32_t dilation_width              */
+        groups,         /* uint32_t groups                      */
+        group_input_channels,  /* size_t group_input_channels   */
+        group_output_channels, /* size_t group_output_channels  */
+        ip_chan_stride, /* size_t input_pixel_stride            */
+        op_chan_stride, /* size_t output_pixel_stride           */
+        izp,            /* int8_t input_zero_point              */
+        ip_scale,       /* float input_scale                    */
+        k_scales[0],    /* float kernel_scale (first entry)     */
+        kernel,         /* const int8_t* kernel                 */
+        bias,           /* const int32_t* bias                  */
+        ozp,            /* int8_t output_zero_point             */
+        op_scale,       /* float output_scale                   */
+        op_min,         /* int8_t output_min                    */
+        op_max,         /* int8_t output_max                    */
+        flags,          /* uint32_t flags                       */
+        nullptr,        /* xnn_caches_t caches                  */
+        nullptr,        /* xnn_weights_cache_t weights_cache    */
+        op);            /* xnn_operator_t* deconvolution_op_out */
+
+  }
+
+  if (!per_channel) {
+    return xnn_create_convolution2d_nhwc_qs8(
+        pad_top,        /* uint32_t input_padding_top         */
+        pad_right,      /* uint32_t input_padding_right       */
+        pad_bottom,     /* uint32_t input_padding_bottom      */
+        pad_left,       /* uint32_t input_padding_left        */
+        kernel_h,       /* uint32_t kernel_height             */
+        kernel_w,       /* uint32_t kernel_width              */
+        stride_h,       /* uint32_t subsampling_height        */
+        stride_w,       /* uint32_t subsampling_width         */
+        dilation_h,     /* uint32_t dilation_height           */
+        dilation_w,     /* uint32_t dilation_width            */
+        groups,         /* uint32_t groups                    */
+        group_input_channels,  /* size_t group_input_channels */
+        group_output_channels, /* size_t group_output_channels*/
+        ip_chan_stride, /* size_t input_channel_stride        */
+        op_chan_stride, /* size_t output_channel_stride       */
+        izp,            /* int8_t input_zero_point            */
+        ip_scale,       /* float input_scale                  */
+        k_scales[0],    /* float kernel_scale (first entry)   */
+        kernel,         /* const int8_t* kernel               */
+        bias,           /* const int32_t* bias                */
+        ozp,            /* int8_t output_zero_point           */
+        op_scale,       /* float output_scale                 */
+        op_min,         /* int8_t output_min                  */
+        op_max,         /* int8_t output_max                  */
+        flags,          /* uint32_t flags                     */
+        nullptr,        /* xnn_caches_t caches                */
+        nullptr,        /* xnn_weights_cache_t weights_cache    */
+        op);            /* xnn_operator_t* convolution_op_out */
+  } else { /* per_channel */
+    return xnn_create_convolution2d_nhwc_qs8_qc8w(
+        pad_top,        /* uint32_t input_padding_top         */
+        pad_right,      /* uint32_t input_padding_right       */
+        pad_bottom,     /* uint32_t input_padding_bottom      */
+        pad_left,       /* uint32_t input_padding_left        */
+        kernel_h,       /* uint32_t kernel_height             */
+        kernel_w,       /* uint32_t kernel_width              */
+        stride_h,       /* uint32_t subsampling_height        */
+        stride_w,       /* uint32_t subsampling_width         */
+        dilation_h,     /* uint32_t dilation_height           */
+        dilation_w,     /* uint32_t dilation_width            */
+        groups,         /* uint32_t groups                    */
+        group_input_channels,  /* size_t group_input_channels */
+        group_output_channels, /* size_t group_output_channels*/
+        ip_chan_stride, /* size_t input_channel_stride        */
+        op_chan_stride, /* size_t output_channel_stride       */
+        izp,            /* int8_t input_zero_point            */
+        ip_scale,       /* float input_scale                  */
+        k_scales,       /* const float* kernel_scale          */
+        kernel,         /* const int8_t* kernel               */
+        bias,           /* const int32_t* bias                */
+        ozp,            /* int8_t output_zero_point           */
+        op_scale,       /* float output_scale                 */
+        op_min,         /* int8_t output_min                  */
+        op_max,         /* int8_t output_max                  */
+        flags,          /* uint32_t flags                     */
+        nullptr,        /* xnn_caches_t caches                */
+        nullptr,        /* xnn_weights_cache_t weights_cache    */
+        op);            /* xnn_operator_t* convolution_op_out */
+  }
+}
+
+/*
+ * Reshape wrapper: forwards to the xnn_reshape_[de]convolution2d_nhwc_* call selected by the flags.
+ */
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_reshape_convolution2d_nhwc(
+    xnn_operator_t op,
+    size_t batch,
+    size_t in_h,
+    size_t in_w,
+    pthreadpool_t pt_pool,
+    bool per_channel = false,
+    bool transpose = false,
+    uint32_t adj_h = 0,
+    uint32_t adj_w = 0) {
+  if (transpose) {
+    TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
+    return xnn_reshape_deconvolution2d_nhwc_qs8(
+        /*deconvolution_op=*/op,
+        /*batch_size=*/batch,
+        /*input_height=*/in_h,
+        /*input_width=*/in_w,
+        /*adjustment_height=*/adj_h,
+        /*adjustment_width=*/adj_w,
+        /*output_height_out=*/nullptr,
+        /*output_width_out=*/nullptr,
+        /*threadpool=*/pt_pool);
+  }
+
+  // Out-params for the convolution reshape calls; the reported values are
+  // not read again inside this wrapper.
+  size_t ws_bytes = SIZE_MAX;
+  size_t ws_align = SIZE_MAX;
+  if (per_channel) {
+    return xnn_reshape_convolution2d_nhwc_qs8_qc8w(
+        /*convolution_op=*/op,
+        /*batch_size=*/batch,
+        /*input_height=*/in_h,
+        /*input_width=*/in_w,
+        /*workspace_size=*/&ws_bytes,
+        /*workspace_alignment=*/&ws_align,
+        /*output_height_out=*/nullptr,
+        /*output_width_out=*/nullptr,
+        /*threadpool=*/pt_pool);
+  }
+  return xnn_reshape_convolution2d_nhwc_qs8(
+      /*convolution_op=*/op,
+      /*batch_size=*/batch,
+      /*input_height=*/in_h,
+      /*input_width=*/in_w,
+      /*workspace_size=*/&ws_bytes,
+      /*workspace_alignment=*/&ws_align,
+      /*output_height_out=*/nullptr,
+      /*output_width_out=*/nullptr,
+      /*threadpool=*/pt_pool);
+}
+
+
+/*
+ * Setup wrapper: forwards op/inp/outp to the xnn_setup_[de]convolution2d_nhwc_* call selected by the flags.
+ */
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_setup_convolution2d_nhwc(
+    xnn_operator_t op,
+    const int8_t* inp,
+    int8_t* outp,
+    bool per_channel = false,
+    bool transpose = false) {
+  if (transpose) {
+    TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
+    return xnn_setup_deconvolution2d_nhwc_qs8(
+        /*deconvolution_op=*/op,
+        /*input=*/inp,
+        /*output=*/outp);
+  }
+
+  // Both convolution variants are given a null workspace pointer.
+  if (per_channel) {
+    return xnn_setup_convolution2d_nhwc_qs8_qc8w(
+        /*convolution_op=*/op,
+        /*workspace=*/nullptr,
+        /*input=*/inp,
+        /*output=*/outp);
+  }
+
+  return xnn_setup_convolution2d_nhwc_qs8(
+      /*convolution_op=*/op,
+      /*workspace=*/nullptr,
+      /*input=*/inp,
+      /*output=*/outp);
+}
+
+
+/*
+ * Wrappers around xnn_create*, xnn_reshape* and xnn_setup* for linear (fully connected).
+ * This one creates a QS8 operator in *fully_connected_op_out; kernel_zero_point must be 0.
+ */
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_create_fully_connected_nc(
+    size_t input_channels,
+    size_t output_channels,
+    size_t input_stride,
+    size_t output_stride,
+    int8_t input_zero_point,
+    float input_scale,
+    int8_t kernel_zero_point,
+    float kernel_scale,
+    const int8_t* kernel,
+    const int32_t* bias,
+    int8_t output_zero_point,
+    float output_scale,
+    int8_t output_min,
+    int8_t output_max,
+    uint32_t flags,
+    xnn_operator_t* fully_connected_op_out) {
+  /* Symmetric quantization forces kzp = 0. The cast makes the int8_t print as a number, not a char. */
+  TORCH_CHECK(!kernel_zero_point, "XNNPACK QS8 linear kernel expects kernel zero point to be zero. "
+                    "But got: ", static_cast<int>(kernel_zero_point));
+  return xnn_create_fully_connected_nc_qs8(
+      input_channels,          /* size_t input_channels                  */
+      output_channels,         /* size_t output_channels                 */
+      input_stride,            /* size_t input_stride                    */
+      output_stride,           /* size_t output_stride                   */
+      input_zero_point,        /* int8_t input_zero_point                */
+      input_scale,             /* float input_scale                      */
+      kernel_scale,            /* float kernel_scale                     */
+      kernel,                  /* const int8_t* kernel                   */
+      bias,                    /* const int32_t* bias                    */
+      output_zero_point,       /* int8_t output_zero_point               */
+      output_scale,            /* float output_scale                     */
+      output_min,              /* int8_t output_min                      */
+      output_max,              /* int8_t output_max                      */
+      flags,                   /* uint32_t flags                         */
+      nullptr,                 /* xnn_caches_t caches                    */
+      nullptr,                 /* xnn_weights_cache_t                    */
+      fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */
+}
+
+// Thin forwarder to xnn_reshape_fully_connected_nc_qs8; returns its status unchanged.
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_reshape_fully_connected_nc(
+    xnn_operator_t fully_connected_op,
+    size_t batch_size,
+    pthreadpool_t threadpool) {
+  const enum xnn_status status = xnn_reshape_fully_connected_nc_qs8(
+      fully_connected_op, batch_size, threadpool);
+  return status;
+}
+
+// Thin forwarder to xnn_setup_fully_connected_nc_qs8: passes the operator and the
+// input/output pointers through and returns the resulting status unchanged.
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_setup_fully_connected_nc(
+    xnn_operator_t fully_connected_op,
+    const int8_t* input,
+    int8_t* output) {
+  const enum xnn_status status = xnn_setup_fully_connected_nc_qs8(
+      fully_connected_op, input, output);
+  return status;
+}
+
+} // namespace xnnp_utils
+} // namespace native
+} // namespace at
+
+#endif // USE_XNNPACK
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h
new file mode 100644
index 0000000000000000000000000000000000000000..85451fb57482a31c2165366505993c0ef67aa920
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h
@@ -0,0 +1,414 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+#include <ATen/core/List.h>
+#include <ATen/native/quantized/cpu/fbgemm_utils.h>
+#include <ATen/native/quantized/cpu/QnnpackUtils.h>
+#include <ATen/native/quantized/cpu/OnednnUtils.h>
+#include <c10/util/irange.h>
+#if !defined(__s390x__) && !defined(__powerpc__)
+#include <cpuinfo.h>
+#endif
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/from_blob.h>
+#endif
+
+
+#include <tuple>
+
+/* Convolution prepacked parameters serialization.
+ *
+ * Version 1
+ *
+ * - Fields:
+ *  1. weight
+ *  2. bias
+ *  3. stride x kSpatialDim
+ *  4. padding x kSpatialDim
+ *  5. dilation x kSpatialDim
+ *  6. groups
+ *
+ * Version 2
+ *
+ * - Fields:
+ *  0. version (string)
+ *  1. list of non-optional tensors
+ *    0: packed parameters (int16_t)
+ *      - kSpatialDim
+ *      - stride x kSpatialDim
+ *      - padding x kSpatialDim
+ *      - dilation x kSpatialDim
+ *      - output_padding x kSpatialDim
+ *      - groups
+ *      - transpose (0 or 1)
+ *    1: weight
+ *  2. list of optional tensors
+ *    0: bias
+ *
+ * Version 3
+ *
+ * - Fields:
+ *  0. version (int64_t)
+ *  1. list of int64_t configuration values
+ *    - kSpatialDim
+ *    - stride x kSpatialDim
+ *    - padding x kSpatialDim
+ *    - dilation x kSpatialDim
+ *    - output_padding x kSpatialDim
+ *    - groups
+ *    - flags (bitmask)
+ *      - (1 << 0) transpose (1 = yes)
+ *  2. list of optional tensors
+ *    0: None (helps with type inference)
+ *    1: weight (this must be present)
+ *    2: bias
+ */
+
+using ConvParamsSerializationTypeV2 = std::tuple<
+  // [0] version string (the parser in this file only accepts "2" for this layout)
+  std::string,
+  // [1] non-optional tensors: packed int16_t params tensor, then weight
+  std::vector<at::Tensor>,
+  // [2] optional tensors: bias
+  std::vector<std::optional<at::Tensor>>>;
+
+using ConvParamsSerializationTypeV3 = std::tuple<
+  // [0] version as an integer (3 for this layout)
+  int64_t,
+  // [1] configuration values: kSpatialDim, stride, padding, dilation, output_padding, groups, flags
+  std::vector<int64_t>,
+  // [2] optional tensors: None placeholder, weight (must be present), bias
+  std::vector<std::optional<at::Tensor>>>;
+
+// Parses any historical conv packed params format (versions 1, 2 and 3)
+// into the current format (version 3).
+template <uint32_t kSpatialDim>
+ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) {
+  // Determine the version from the first tuple element:
+  // Tensor -> 1, string "2" -> 2, int 3 -> 3; anything else is rejected.
+  int version = -1;
+  if (v.isTuple()) {
+    const auto& elements = v.toTupleRef().elements();
+    if (!elements.empty()) {
+      // Only inspected, so bind by reference instead of copying the IValue.
+      const auto& firstElement = elements[0];
+      if (firstElement.isTensor()) {
+        version = 1;
+      } else if (firstElement.isString()) {
+        // note: not parsing the string to automatically handle bad
+        // inputs
+        if (firstElement.toStringRef() == "2") {
+          version = 2;
+        }
+      } else if (firstElement.isInt()) {
+        if (firstElement.toInt() == 3) {
+          version = 3;
+        }
+      }
+    }
+  }
+  TORCH_INTERNAL_ASSERT(version != -1, "Unable to parse serialization version");
+
+  if (version == 1) {
+    // version 1 - convert to version 3 manually
+    const auto& elements = v.toTupleRef().elements();
+    // Fields 0..5 are indexed below; fail cleanly on a truncated tuple.
+    TORCH_CHECK(elements.size() >= 6,
+        "Expected at least 6 fields in version 1 serialized qconv, got ", elements.size());
+
+    at::Tensor weight = elements[0].toTensor();
+    std::optional<at::Tensor> bias = elements[1].toOptional<at::Tensor>();
+    torch::List<at::Tensor> stride_x_kSpatialDim = elements[2].toTensorList();
+    torch::List<at::Tensor> padding_x_kSpatialDim = elements[3].toTensorList();
+    torch::List<at::Tensor> dilation_x_kSpatialDim = elements[4].toTensorList();
+    at::Tensor groups = elements[5].toTensor();
+
+    std::vector<int64_t> config_vals;
+    config_vals.reserve(
+        stride_x_kSpatialDim.size() + padding_x_kSpatialDim.size() +
+        dilation_x_kSpatialDim.size() + kSpatialDim + 3);
+    config_vals.push_back(kSpatialDim);
+    for (const auto i : c10::irange(stride_x_kSpatialDim.size())) {
+      auto stride = stride_x_kSpatialDim.get(i);
+      config_vals.push_back(stride[0].item<int16_t>());
+    }
+    for (const auto i : c10::irange(padding_x_kSpatialDim.size())) {
+      auto padding = padding_x_kSpatialDim.get(i);
+      config_vals.push_back(padding[0].item<int16_t>());
+    }
+    for (const auto i : c10::irange(dilation_x_kSpatialDim.size())) {
+      auto dilation = dilation_x_kSpatialDim.get(i);
+      config_vals.push_back(dilation[0].item<int16_t>());
+    }
+    // output_padding does not exist in v1, so we fill in a default value
+    for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
+      config_vals.push_back(0);
+    }
+    config_vals.push_back(groups[0].item<int16_t>());
+    // transpose does not exist in v1, so we fill in a default value
+    config_vals.push_back(0);
+
+    std::vector<std::optional<at::Tensor>> tensors;
+    tensors.emplace_back();
+    tensors.emplace_back(std::move(weight));
+    tensors.emplace_back(std::move(bias));
+
+    return std::make_tuple(int64_t{3}, std::move(config_vals), std::move(tensors));
+  } else if (version == 2) {
+    // version 2
+    const auto& elements = v.toTupleRef().elements();
+    TORCH_CHECK(elements.size() >= 3,
+        "Expected at least 3 fields in version 2 serialized qconv, got ", elements.size());
+    std::vector<at::Tensor> non_optional = elements[1].toTensorList().vec();
+    TORCH_CHECK(non_optional.size() >= 2,
+        "Expected at least 2 non-optional tensors in version 2 serialized "
+        "qconv, got ", non_optional.size());
+    std::vector<std::optional<at::Tensor>> optional;
+
+    if (elements[2].isTensorList()) {
+      for (const auto& elem : elements[2].toTensorList()) {
+        optional.emplace_back(static_cast<at::Tensor>(elem));
+      }
+    } else {
+      for (const auto& elem : elements[2].toList()) {
+        optional.emplace_back(static_cast<c10::IValue>(elem).toOptional<at::Tensor>());
+      }
+    }
+    // create default optional value for bias
+    if (optional.empty()) {
+      optional.emplace_back();
+    }
+
+    auto config_a = non_optional[0].accessor<int16_t, 1>();
+    std::vector<int64_t> config_vals;
+    config_vals.reserve(config_a.size(0));
+    for (const auto i : c10::irange(config_a.size(0))) {
+      config_vals.emplace_back(config_a[i]);
+    }
+
+    std::vector<std::optional<at::Tensor>> tensors;
+    tensors.emplace_back();
+    tensors.emplace_back(non_optional[1]);
+    tensors.emplace_back(optional[0]);
+
+    return std::make_tuple(int64_t{3}, std::move(config_vals), std::move(tensors));
+  } else if (version == 3) {
+    return v.to<ConvParamsSerializationTypeV3>();
+  } else {
+    TORCH_INTERNAL_ASSERT(false, "Unexpected serialized qconv version: ", version);
+  }
+}
+
+#define QCONV_SERIALIZATION_VERSION 2
+
+#if QCONV_SERIALIZATION_VERSION == 2
+using ConvParamsSerializationType = ConvParamsSerializationTypeV2;
+
+template <uint32_t kSpatialDim>
+ConvParamsSerializationTypeV2 serialize_conv(
+    const c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>>& params) {
+  // Serializes `params` as ("2", {packed int16_t config tensor, weight}, {bias}).
+  // Config order: kSpatialDim, stride, padding, dilation, output_padding,
+  // groups, transpose.
+  std::vector<int16_t> params_vec;
+  // The config is stored as int16_t. Reject any value that does not survive
+  // the narrowing round trip instead of silently writing a truncated number.
+  auto push_checked = [&params_vec](int64_t value) {
+    const auto narrowed = static_cast<int16_t>(value);
+    TORCH_CHECK(narrowed == value,
+        "serialize_conv: value ", value, " does not fit in int16_t");
+    params_vec.push_back(narrowed);
+  };
+  push_checked(kSpatialDim);
+  for (const int64_t s : params->stride().vec()) { push_checked(s); }
+  for (const int64_t p : params->padding().vec()) { push_checked(p); }
+  for (const int64_t d : params->dilation().vec()) { push_checked(d); }
+  for (const int64_t o : params->output_padding().vec()) { push_checked(o); }
+  push_checked(params->groups());
+  push_checked(params->transpose());
+  const int64_t vec_size = static_cast<int64_t>(params_vec.size());
+  at::Tensor params_tensor = at::from_blob(
+      params_vec.data(), {vec_size},
+      at::TensorOptions().dtype(at::kShort))
+    // clone to retain ownership of the data
+    .clone();
+
+  auto [weight, bias] = params->unpack();
+  std::vector<at::Tensor> non_optional;
+  non_optional.emplace_back(std::move(params_tensor));
+  non_optional.emplace_back(std::move(weight));
+  std::vector<std::optional<at::Tensor>> optional;
+  optional.emplace_back(std::move(bias));
+  return std::make_tuple(std::string("2"), std::move(non_optional), std::move(optional));
+}
+
+#elif QCONV_SERIALIZATION_VERSION == 3
+using ConvParamsSerializationType = ConvParamsSerializationTypeV3;
+
+template <uint32_t kSpatialDim>
+ConvParamsSerializationTypeV3 serialize_conv(
+    const c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>>& params) {
+  // Version-3 layout: (3, int64_t config values, {None, weight, bias}).
+  std::vector<int64_t> cfg;
+  const auto append_all = [&cfg](const auto& values) {
+    cfg.insert(cfg.end(), values.begin(), values.end());
+  };
+  cfg.push_back(kSpatialDim);
+  append_all(params->stride().vec());
+  append_all(params->padding().vec());
+  append_all(params->dilation().vec());
+  append_all(params->output_padding().vec());
+  cfg.push_back(params->groups());
+  cfg.push_back(params->transpose());
+
+  auto [weight, bias] = params->unpack();
+
+  // Slot 0 stays empty; weight and bias follow in that order.
+  std::vector<std::optional<at::Tensor>> packed;
+  packed.emplace_back();
+  packed.emplace_back(weight);
+  packed.emplace_back(bias);
+
+  int64_t version = 3;
+  return std::tie(version, cfg, packed);
+}
+
+#else
+#error "Invalid qconv serialization version."
+#endif
+
+// Rebuilds packed conv params from a version-3 serialized state by calling
+// prepack() on the backend that matches at::globalContext().qEngine().
+// config_vals: kSpatialDim, stride, padding, dilation, output_padding, groups, flags.
+template <uint32_t kSpatialDim>
+c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> deserialize_conv(
+    ConvParamsSerializationTypeV3 state) {
+  const auto& [version, config_vals, tensors] = state;
+  TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version);
+
+  TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors: ", tensors.size());
+  std::optional<at::Tensor> weight = tensors[1];
+  std::optional<at::Tensor> bias = tensors[2];
+  TORCH_INTERNAL_ASSERT(weight, "Weight should always be present in serialized qconv.");
+
+  torch::List<int64_t> stride, padding, output_padding, dilation;
+  // skip kSpatialDim
+  int idx = 1;
+  for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
+    stride.emplace_back(config_vals.at(idx));
+    idx++;
+  }
+  for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
+    padding.emplace_back(config_vals.at(idx));
+    idx++;
+  }
+  for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
+    dilation.emplace_back(config_vals.at(idx));
+    idx++;
+  }
+  for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) {
+    TORCH_INTERNAL_ASSERT(idx < static_cast<int64_t>(config_vals.size()),
+        "Unexpected index = ", idx, " for config_vals of size ",
+        config_vals.size());
+    output_padding.emplace_back(config_vals.at(idx));
+    idx++;
+  }
+  int64_t groups = config_vals.at(idx);
+  idx++;
+  int64_t flags = config_vals.at(idx);
+  idx++;
+  TORCH_INTERNAL_ASSERT(idx == static_cast<int64_t>(config_vals.size()),
+      "Unexpected length of config_vals, expected ",
+      idx,
+      " got ",
+      config_vals.size());
+
+  // Bit 0 of flags is the transpose flag; no other bit may be set.
+  bool transpose = flags & (1 << 0);
+
+  int64_t other_flags = flags & ~(1 << 0);
+  TORCH_INTERNAL_ASSERT(other_flags == 0, "Unexpected flags set in ", flags, ".");
+
+  auto& ctx = at::globalContext();
+
+#ifdef USE_FBGEMM
+  if (ctx.qEngine() == at::QEngine::X86) {
+#if AT_MKLDNN_ENABLED()
+    bool use_onednn = onednn_utils::should_use_onednn_quant(
+        weight.value(), transpose, groups, output_padding);
+    if (use_onednn) {
+      return PackedConvWeightsOnednn<kSpatialDim>::prepack(
+        weight.value(),
+        bias,
+        stride,
+        padding,
+        output_padding,
+        dilation,
+        groups,
+        transpose
+      );
+    }
+#endif
+    return PackedConvWeight<kSpatialDim>::prepack(
+      weight.value(),
+      bias,
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  } // x86
+  if (ctx.qEngine() == at::QEngine::FBGEMM) {
+    return PackedConvWeight<kSpatialDim>::prepack(
+      weight.value(),
+      bias,
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  }
+#endif // USE_FBGEMM
+#ifdef USE_PYTORCH_QNNPACK
+  if (ctx.qEngine() == at::QEngine::QNNPACK) {
+    TORCH_CHECK(
+        kSpatialDim == 2,
+        "prepack/__setstate__: QNNPACK only supports Conv2d "
+        "now.");
+    return PackedConvWeightsQnnp<kSpatialDim>::prepack(
+      weight.value(),
+      bias,
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  }
+#endif // USE_PYTORCH_QNNPACK
+#if AT_MKLDNN_ENABLED()
+  if (ctx.qEngine() == at::QEngine::ONEDNN) {
+    return PackedConvWeightsOnednn<kSpatialDim>::prepack(
+      weight.value(),
+      bias,
+      stride,
+      padding,
+      output_padding,
+      dilation,
+      groups,
+      transpose
+    );
+  }
+#endif // AT_MKLDNN_ENABLED()
+  TORCH_CHECK(
+    false,
+    "Didn't find engine for when deserializing ConvPackedParams: ",
+    toString(ctx.qEngine()));
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..407d6550574dc2cef42a5db7b9ffc5df92cfa2f9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h
@@ -0,0 +1,413 @@
+#pragma once
+
+#include <ATen/Tensor.h>
+#include <ATen/native/quantized/PackedParams.h>
+#include <ATen/native/quantized/cpu/EmbeddingPackedParams.h>
+#include <c10/core/QScheme.h>
+#include <c10/util/irange.h>
+
+#ifdef USE_FBGEMM
+#include <fbgemm/Fbgemm.h>
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Winconsistent-missing-destructor-override")
+#include <fbgemm/FbgemmFP16.h>
+C10_DIAGNOSTIC_POP()
+#include <fbgemm/QuantUtils.h>
+
+// The struct for the packed weight matrix (PackBMatrix) and the corresponding
+// column offsets used for the fully connected layer, which are both prepared
+// in the prepacking step to save computation during inference. Note the
+// column offsets include the sum of the B columns as well as the scalar term
+// B_zero_point * K, whereas the row offsets created by
+// PackAWithQuantRowOffset/PackAWithIm2Col/PackAWithRowOffset are only the sum
+// of the A rows. The column offsets are needed for the asymmetric quantization
+// (affine quantization) of the input matrix.
+// Note that in JIT mode we can think of a way to fuse col_offsets with bias.
+struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase {
+  // Takes ownership of the packed matrix `w` and moves every other argument
+  // into the member of the same name (`bias` is stored as `bias_`).
+  PackedLinearWeight(
+      std::unique_ptr<fbgemm::PackBMatrix<int8_t>> w,
+      std::optional<at::Tensor> bias,
+      std::vector<int32_t> col_offsets,
+      std::vector<float> w_scale,
+      std::vector<int32_t> w_zp,
+      c10::QScheme q_scheme)
+      : w(std::move(w)),
+        bias_(std::move(bias)),
+        col_offsets(std::move(col_offsets)),
+        w_scale(std::move(w_scale)),
+        w_zp(std::move(w_zp)),
+        q_scheme(std::move(q_scheme)) {}
+  // Packed int8 weight matrix, owned by this object.
+  std::unique_ptr<fbgemm::PackBMatrix<int8_t>> w;
+  // Optional bias tensor; handed back unchanged by bias().
+  std::optional<at::Tensor> bias_;
+  // Column offsets described in the struct comment above.
+  std::vector<int32_t> col_offsets;
+  // Weight quantization scales and zero points.
+  // NOTE(review): presumably one entry per tensor or per output channel
+  // depending on q_scheme -- confirm against prepack().
+  std::vector<float> w_scale;
+  std::vector<int32_t> w_zp;
+  // Quantization scheme recorded for the weight.
+  c10::QScheme q_scheme;
+
+  // The methods below override the LinearPackedParamsBase interface; their
+  // definitions are not part of this header.
+  at::Tensor apply(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      at::Tensor input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  // Out-variants: the result is written into, and returned as, `output`.
+  at::Tensor& apply_out(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point,
+      at::Tensor& output) override;
+
+  at::Tensor& apply_relu_out(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point,
+      at::Tensor& output) override;
+
+  // Variants that take the input quantization parameters instead of output
+  // ones. NOTE(review): the name suggests an fp32 result -- confirm against
+  // the definition.
+  at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) override;
+
+  at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32(
+      at::Tensor input,
+      double input_scale,
+      int64_t input_zero_point) override;
+
+  // Dynamic variants; `reduce_range` defaults to false.
+  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false)
+      override;
+
+  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false)
+      override;
+
+  // Returns a (tensor, optional tensor) pair.
+  // NOTE(review): presumably (weight, bias) -- confirm against the definition.
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+
+  // Returns a copy of the stored optional bias.
+  std::optional<at::Tensor> bias() override {
+    return bias_;
+  }
+
+  // Factory returning packed parameters through the base-class pointer type.
+  static c10::intrusive_ptr<LinearPackedParamsBase> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias);
+
+ private:
+  // Shared implementations parameterized on whether ReLU is fused; defined
+  // outside this header.
+  template <bool ReluFused>
+  at::Tensor& apply_impl(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point,
+      at::Tensor& output);
+
+  template <bool ReluFused>
+  at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32_impl(
+      const at::Tensor& input,
+      double input_scale,
+      int64_t input_zero_point);
+
+  template <bool ReluFused>
+  at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false);
+};
+
+// Packed linear weights stored as an fbgemm::PackedGemmMatrixFP16 together
+// with an optional bias. Only the dynamic entry points are implemented; the
+// statically quantized apply()/apply_relu() always fail an internal assert.
+struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase {
+  // Takes ownership of the packed matrix and moves the bias into `bias_`.
+  PackedLinearWeightFp16(
+      std::unique_ptr<fbgemm::PackedGemmMatrixFP16> w,
+      std::optional<at::Tensor> bias)
+      : w(std::move(w)), bias_(std::move(bias)) {}
+
+  // Packed fp16 weight matrix, owned by this object.
+  std::unique_ptr<fbgemm::PackedGemmMatrixFP16> w;
+  // Optional bias tensor; handed back unchanged by bias().
+  std::optional<at::Tensor> bias_;
+
+  // Not supported for this weight type: unconditionally trips
+  // TORCH_INTERNAL_ASSERT(false).
+  at::Tensor apply(
+      at::Tensor /*input*/,
+      double /*output_scale*/,
+      int64_t /*output_zero_point*/) override {
+    TORCH_INTERNAL_ASSERT(false);
+  }
+  // Not supported for this weight type: unconditionally trips
+  // TORCH_INTERNAL_ASSERT(false).
+  at::Tensor apply_relu(
+      at::Tensor /*input*/,
+      double /*output_scale*/,
+      int64_t /*output_zero_point*/) override {
+    TORCH_INTERNAL_ASSERT(false);
+  }
+
+  // Dynamic entry points (definitions are not in this header).
+  // `reduce_range` defaults to false.
+  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false)
+      override;
+  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false)
+      override;
+
+  // Out-variants: the result is written into, and returned as, `output`.
+  at::Tensor& apply_dynamic_out(
+      const at::Tensor& input,
+      at::Tensor& output,
+      bool reduce_range = false) override;
+  at::Tensor& apply_dynamic_relu_out(
+      const at::Tensor& input,
+      at::Tensor& output,
+      bool reduce_range = false) override;
+
+  // Returns a (tensor, optional tensor) pair.
+  // NOTE(review): presumably (weight, bias) -- confirm against the definition.
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+
+  // Returns a copy of the stored optional bias.
+  std::optional<at::Tensor> bias() override {
+    return bias_;
+  }
+
+  // Factory returning packed parameters through the base-class pointer type.
+  static c10::intrusive_ptr<LinearPackedParamsBase> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias);
+
+  // Overrides the base-class bias setter; defined outside this header.
+  void set_bias(std::optional<at::Tensor> bias) override;
+
+ private:
+  // Shared implementation parameterized on whether ReLU is fused.
+  template <bool ReluFused>
+  at::Tensor& apply_dynamic_impl(const at::Tensor& input, at::Tensor& output);
+};
+
+// Packed convolution weights (fbgemm::PackWeightsForConv) bundled with the
+// convolution hyper-parameters and weight quantization metadata.
+// kSpatialDim is the number of spatial dimensions and defaults to 2.
+template <int kSpatialDim = 2>
+struct TORCH_API PackedConvWeight : public ConvPackedParamsBase<kSpatialDim> {
+  // Takes ownership of the packed weights `w` and moves/copies the remaining
+  // arguments into the corresponding members. The list-valued convolution
+  // parameters are stored in the underscore-suffixed members.
+  PackedConvWeight(
+      std::unique_ptr<fbgemm::PackWeightsForConv<kSpatialDim>> w,
+      std::optional<at::Tensor> bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      uint8_t transpose,
+      std::vector<int32_t> col_offsets,
+      std::vector<int64_t> kernel,
+      std::vector<float> w_scale,
+      std::vector<int32_t> w_zp,
+      c10::QScheme q_scheme)
+      : w(std::move(w)),
+        bias(std::move(bias)),
+        stride_(std::move(stride)),
+        padding_(std::move(padding)),
+        output_padding_(std::move(output_padding)),
+        dilation_(std::move(dilation)),
+        groups_(groups),
+        transpose_(transpose),
+        col_offsets(std::move(col_offsets)),
+        kernel(std::move(kernel)),
+        w_scale(std::move(w_scale)),
+        w_zp(std::move(w_zp)),
+        q_scheme(q_scheme) {}
+
+  // Packed weights, owned by this object.
+  std::unique_ptr<fbgemm::PackWeightsForConv<kSpatialDim>> w;
+  // Optional bias tensor.
+  std::optional<at::Tensor> bias;
+  // Convolution hyper-parameters, exposed through the accessors below.
+  torch::List<int64_t> stride_;
+  torch::List<int64_t> padding_;
+  torch::List<int64_t> output_padding_;
+  torch::List<int64_t> dilation_;
+  int64_t groups_;
+  // Stored as uint8_t; transpose() converts it back to bool.
+  uint8_t transpose_;
+  // NOTE(review): presumably the per-column weight sums used for asymmetric
+  // activation quantization -- confirm against prepack().
+  std::vector<int32_t> col_offsets;
+  // NOTE(review): presumably the kernel extent per spatial dimension --
+  // confirm against prepack().
+  std::vector<int64_t> kernel;
+  // Weight quantization scales and zero points.
+  std::vector<float> w_scale;
+  std::vector<int32_t> w_zp;
+  // Quantization scheme recorded for the weight.
+  c10::QScheme q_scheme;
+
+  // The methods below override the ConvPackedParamsBase interface; their
+  // definitions are not part of this header.
+  at::Tensor apply(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_relu(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point) override;
+
+  at::Tensor apply_dynamic(
+    const at::Tensor& input,
+    bool reduce_range) override;
+
+  // Returns a (tensor, optional tensor) pair.
+  // NOTE(review): presumably (weight, bias) -- confirm against the definition.
+  std::tuple<at::Tensor, std::optional<at::Tensor>> unpack() override;
+
+  // Factory returning packed parameters through the base-class pointer type.
+  // Note that `transpose` is a bool here but a uint8_t in the constructor.
+  static c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> prepack(
+      at::Tensor weight,
+      std::optional<at::Tensor> bias,
+      torch::List<int64_t> stride,
+      torch::List<int64_t> padding,
+      torch::List<int64_t> output_padding,
+      torch::List<int64_t> dilation,
+      int64_t groups,
+      bool transpose);
+
+  // Declared here, defined elsewhere.
+  const float* GetBiasData(at::Tensor* bias);
+
+  // Declared here, defined elsewhere; results are written through the two
+  // output-vector pointers.
+  void GetQuantizationParams(
+      float act_scale,
+      float out_scale,
+      std::vector<float>* output_multiplier_float,
+      std::vector<float>* act_times_w_scale);
+
+  // Accessors returning the stored convolution hyper-parameters.
+  torch::List<int64_t> stride() const override {
+    return stride_;
+  }
+
+  torch::List<int64_t> padding() const override {
+    return padding_;
+  }
+
+  torch::List<int64_t> output_padding() const override {
+    return output_padding_;
+  }
+
+  torch::List<int64_t> dilation() const override {
+    return dilation_;
+  }
+
+  int64_t groups() const override {
+    return groups_;
+  }
+
+  // Any non-zero stored value reads back as true.
+  bool transpose() const override {
+    return (bool)transpose_;
+  }
+
+ private:
+  // Shared implementation parameterized on whether ReLU is fused.
+  template <bool ReluFused>
+  at::Tensor apply_impl(
+      const at::Tensor& input,
+      double output_scale,
+      int64_t output_zero_point);
+};
+
+// PackWeight: re-centre `len` unsigned 8-bit weights into signed 8-bit by
+// subtracting 128 from each element (0 -> -128, 255 -> 127).
+// `src_uint8` and `dst_int8` must each reference at least `len` elements.
+inline void convert_uint8_int8(
+    int len,
+    const uint8_t* src_uint8,
+    int8_t* dst_int8) {
+  constexpr int32_t kSignedOffset = 128;
+  for (const auto idx : c10::irange(len)) {
+    // Widen first so the subtraction cannot wrap in 8 bits.
+    const int32_t widened = static_cast<int32_t>(src_uint8[idx]);
+    dst_int8[idx] = static_cast<int8_t>(widened - kSignedOffset);
+  }
+}
+
+// UnpackWeight: shift `len` signed 8-bit weights back into unsigned 8-bit by
+// adding 128 to each element (-128 -> 0, 127 -> 255).
+// `src_int8` and `dst_uint8` must each reference at least `len` elements.
+inline void convert_int8_uint8(
+    int len,
+    const int8_t* src_int8,
+    uint8_t* dst_uint8) {
+  constexpr int32_t kSignedOffset = 128;
+  for (const auto idx : c10::irange(len)) {
+    // Widen first so the addition cannot wrap in 8 bits.
+    const int32_t widened = static_cast<int32_t>(src_int8[idx]);
+    dst_uint8[idx] = static_cast<uint8_t>(widened + kSignedOffset);
+  }
+}
+
+namespace at {
+namespace native {
+namespace fbgemm_utils {
+
+// Declaration only: returns an fbgemm::conv_param_t<kSpatialDim> built from
+// the given arguments. `output_padding` defaults to kSpatialDim zeros and
+// `transposed` defaults to false.
+// NOTE(review): N, C and M are presumably batch size, input channels and
+// output channels -- confirm against the definition.
+template <int kSpatialDim = 2>
+fbgemm::conv_param_t<kSpatialDim> MakeFbgemmConvParam(
+    int N,
+    int C,
+    int M,
+    const std::vector<int>& image_shape,
+    int groups,
+    const std::vector<int>& kernels,
+    const std::vector<int>& strides,
+    const std::vector<int>& pads,
+    const std::vector<int>& dilations,
+    const std::vector<int>& output_padding = std::vector<int>(kSpatialDim, 0),
+    bool transposed = false);
+
+// TODO: Remove functions below when ChannelsLast3d is ready.
+// Declaration only: takes explicit sizes and strides, tensor options and a
+// quantizer, and returns a Tensor.
+Tensor MakeStridedQTensorCPU(
+    const IntArrayRef& sizes,
+    const IntArrayRef& strides,
+    const TensorOptions& options,
+    QuantizerPtr quantizer);
+
+// Declaration only: takes five extents (N, C, D, H, W), tensor options and a
+// single scale/zero_point pair.
+Tensor MakeEmptyAffineQuantizedChannelsLast3dTensor(
+    int64_t N,
+    int64_t C,
+    int64_t D,
+    int64_t H,
+    int64_t W,
+    const TensorOptions& options,
+    double scale,
+    int64_t zero_point);
+
+// Declaration only: same extents as above, but the quantization parameters
+// are passed as the tensors `scales` and `zero_points`.
+Tensor MakeEmptyPerChannelAffineQuantizedChannelsLast3dTensor(
+    int64_t N,
+    int64_t C,
+    int64_t D,
+    int64_t H,
+    int64_t W,
+    const TensorOptions& options,
+    const Tensor& scales,
+    const Tensor& zero_points);
+
+// Declaration only; defined elsewhere.
+Tensor ConvertToChannelsLast3dTensor(const Tensor& src);
+
+// Declaration only; kSpatialDim defaults to 2.
+template <int kSpatialDim = 2>
+Tensor TransposeConvTensorUnpackConversion(const Tensor& src, int groups);
+
+// Declaration only; unlike the templates above, kSpatialDim has no default
+// here and must be given explicitly.
+template <int kSpatialDim>
+Tensor ConvertConvWeightsToChannelLastTensor(
+    const at::Tensor& src,
+    int groups,
+    bool transpose);
+} // namespace fbgemm_utils
+} // namespace native
+} // namespace at
+
+#endif // USE_FBGEMM
+
+// Packed weight tensor for the quantized embedding-bag operators, stored
+// together with the quantization metadata supplied at construction time.
+struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase {
+  // Stores the packed weight and its metadata. If the tensor passed in is not
+  // contiguous, the stored member is replaced by a contiguous copy.
+  PackedEmbeddingBagWeight(
+      at::Tensor packed_w,
+      std::vector<float> w_scale,
+      std::vector<float> w_zp,
+      int64_t bit_rate,
+      c10::QScheme q_scheme,
+      int64_t version)
+      : packed_w(std::move(packed_w)),
+        w_scale(std::move(w_scale)),
+        w_zp(std::move(w_zp)),
+        bit_rate_(bit_rate),
+        q_scheme(q_scheme),
+        version_(version) {
+    // The `packed_w` parameter shadows the member of the same name and has
+    // already been moved from by the initializer list above, so the member
+    // must be named through `this->`. Using the bare name here would inspect
+    // the moved-from parameter and leave the stored tensor untouched.
+    if (!this->packed_w.is_contiguous()) {
+      this->packed_w = this->packed_w.contiguous();
+    }
+  }
+
+  // Packed weight tensor (contiguous after construction).
+  at::Tensor packed_w;
+  // Weight quantization scales and zero points (both stored as float).
+  std::vector<float> w_scale;
+  std::vector<float> w_zp;
+  // Value returned by bit_rate().
+  int64_t bit_rate_;
+  // Quantization scheme recorded for the weight.
+  c10::QScheme q_scheme;
+  // Value returned by version().
+  int64_t version_;
+
+  // Overrides of the EmbeddingPackedParamsBase interface; unpack() and
+  // prepack() are defined outside this header.
+  at::Tensor unpack() override;
+  static c10::intrusive_ptr<EmbeddingPackedParamsBase> prepack(
+      at::Tensor weight);
+
+  int64_t bit_rate() const override {
+    return bit_rate_;
+  }
+
+  int64_t version() const override {
+    return version_;
+  }
+
+  // Lookup entry points; both share the same parameter list and are defined
+  // outside this header.
+  at::Tensor embeddingbag_byte(
+      const at::Tensor& indices,
+      const std::optional<at::Tensor>& offsets,
+      bool pruned_weights,
+      const std::optional<at::Tensor>& per_sample_weights_,
+      const std::optional<at::Tensor>& compressed_indices_mapping,
+      bool include_last_offset,
+      bool is_embedding_op) override;
+
+  at::Tensor embeddingbag_4bit(
+      const at::Tensor& indices,
+      const std::optional<at::Tensor>& offsets,
+      bool pruned_weights,
+      const std::optional<at::Tensor>& per_sample_weights_,
+      const std::optional<at::Tensor>& compressed_indices_mapping,
+      bool include_last_offset,
+      bool is_embedding_op) override;
+};
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..dbfb406ea55dbb50f97b1e86efb52c337af04847
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#ifdef USE_PYTORCH_QNNPACK
+
+namespace at {
+namespace native {
+
+// Declaration only; the definition lives outside this header. Takes no
+// arguments and returns nothing.
+void initQNNPACK();
+
+} // namespace native
+} // namespace at
+
+#endif
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h
new file mode 100644
index 0000000000000000000000000000000000000000..644d85fa357ee21140986a2a225dfa937a68437e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h
@@ -0,0 +1,34 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <cstdint>
+
+namespace at {
+namespace native {
+// Out-variant declaration: takes `output` by mutable reference and returns a
+// Tensor reference. The bool and int64_t parameters left unnamed here are
+// labelled scale_grad_by_freq and mode by their inline comments.
+// NOTE(review): presumably this is the byte (8-bit) row-wise embedding-bag
+// lookup -- confirm against the definition.
+Tensor& embedding_bag_byte_rowwise_offsets_out(
+    Tensor& output,
+    const Tensor& weight,
+    const Tensor& indices,
+    const std::optional<Tensor>& offsets_in,
+    const bool /* scale_grad_by_freq */,
+    const int64_t /* mode */,
+    bool pruned_weights,
+    const std::optional<Tensor>& per_sample_weights_,
+    const std::optional<Tensor>& compressed_indices_mapping,
+    bool include_last_offset);
+
+// Same parameter list as the byte variant above.
+// NOTE(review): presumably the 4-bit counterpart -- confirm against the
+// definition.
+Tensor& embedding_bag_4bit_rowwise_offsets_out(
+    Tensor& output,
+    const Tensor& weight,
+    const Tensor& indices,
+    const std::optional<Tensor>& offsets_in,
+    const bool /* scale_grad_by_freq */,
+    const int64_t /* mode */,
+    bool pruned_weights,
+    const std::optional<Tensor>& per_sample_weights_,
+    const std::optional<Tensor>& compressed_indices_mapping,
+    bool include_last_offset);
+
+// Out-variant declaration taking the packed weight and the destination tensor.
+Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weight);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a65f3f07f397b931c1a4b6bd781e6308643117f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h
@@ -0,0 +1,13 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+
+namespace at { namespace native {
+
+// Declarations only; all three are defined outside this header.
+
+// Out-variant: takes the destination tensor by mutable reference and returns
+// a Tensor reference.
+Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight);
+
+// Value-returning variant taking only the weight.
+Tensor qembeddingbag_byte_prepack(const Tensor& weight);
+
+// NOTE(review): the `_meta` suffix suggests a shape-only (meta tensor)
+// implementation -- confirm against the definition.
+Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/attention.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/attention.h
new file mode 100644
index 0000000000000000000000000000000000000000..49fbdc46ee2a687ad60b3233ed0be99737b51cf5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/attention.h
@@ -0,0 +1,72 @@
+#pragma once
+#include <ATen/core/Tensor.h>
+#include <c10/macros/Export.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/native/transformers/attention.h>
+#include <optional>
+
+namespace at {
+namespace native {
+
+// Function-pointer type dispatched through _fused_sdp_choice_stub.
+// NOTE(review): the int64_t result is presumably an sdp::SDPBackend value --
+// confirm against the registered kernels.
+using fused_sdp_choice_fn = int64_t (*)(const Tensor& query_, const Tensor& key, const Tensor& value,
+        const std::optional<Tensor>& attn_mask_, double dropout_p, bool is_causal, std::optional<double> scale, bool enable_gqa);
+
+DECLARE_DISPATCH(fused_sdp_choice_fn, _fused_sdp_choice_stub);
+
+// The TORCH_API functions below are declarations only; their definitions are
+// not part of this header.
+TORCH_API Tensor bmm_nt(const Tensor& a, const Tensor& b);
+// `mask_type` defaults to an empty optional.
+TORCH_API Tensor masked_softmax(
+    Tensor& attn_scores,
+    std::optional<Tensor> attn_mask,
+    const Tensor& query,
+    std::optional<int64_t> mask_type = {});
+
+// Function-pointer type dispatched through transform_bias_rescale_qkv_stub.
+// Buffers are passed as untyped pointers alongside the scalar type.
+// NOTE(review): B, T, D are presumably batch, sequence length and embedding
+// size -- confirm against the kernels.
+using transform_bias_rescale_qkv_fn = void(*)(
+    at::ScalarType type,
+    void* _q_k_v,
+    const void* _qkv,
+    const void* _qkv_bias,
+    int64_t B,
+    int64_t T,
+    int64_t D,
+    int64_t num_head);
+
+DECLARE_DISPATCH(transform_bias_rescale_qkv_fn, transform_bias_rescale_qkv_stub);
+
+TORCH_API Tensor transform0213_gemm_nt_bias(
+    const Tensor& a,
+    const Tensor& b,
+    const Tensor& c,
+    const Tensor& query);
+
+// Takes `out` by mutable reference in addition to the two operands.
+TORCH_API Tensor bmm_nn(Tensor& out, const Tensor& a, const Tensor& b);
+
+// NOTE(review): `line` is presumably the caller's source line, used for
+// diagnostics -- confirm against the definition.
+TORCH_API void debug_assert_shape(int line, const Tensor& t, c10::IntArrayRef shape);
+
+TORCH_API Tensor qkv_projection(
+    const Tensor& query,
+    const Tensor& key,
+    const Tensor& value,
+    const int64_t embed_dim,
+    const Tensor& qkv_weight);
+
+// Function-pointer type dispatched through flash_attention_kernel. It returns
+// void; `output` and `logsumexp` are the leading tensor arguments.
+// NOTE(review): presumably the kernel writes its results into those two
+// tensors -- confirm against the kernels.
+using flash_attention_fn = void (*)(
+    const Tensor& output, const Tensor& logsumexp,
+    const Tensor& query, const Tensor& key, const Tensor& value,
+    double dropout_p, bool is_causal,
+    std::optional<Tensor> attn_mask,
+    std::optional<double> scale);
+
+// Function-pointer type dispatched through flash_attention_backward_kernel.
+// It returns void; grad_q/grad_k/grad_v are the leading tensor arguments.
+// NOTE(review): presumably the kernel writes the gradients into those three
+// tensors -- confirm against the kernels.
+using flash_attention_backward_fn = void (*)(
+    const Tensor& grad_q, const Tensor& grad_k,
+    const Tensor& grad_v, const Tensor& grad_out,
+    const Tensor& query, const Tensor& key,
+    const Tensor& value, const Tensor& out, const Tensor& logsumexp,
+    double dropout_p, bool is_causal,
+    std::optional<Tensor> attn_mask,
+    std::optional<double> scale);
+
+DECLARE_DISPATCH(flash_attention_fn, flash_attention_kernel);
+DECLARE_DISPATCH(flash_attention_backward_fn, flash_attention_backward_kernel);
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h
new file mode 100644
index 0000000000000000000000000000000000000000..272700392e1d71e73788dd03fc66e14153040937
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h
@@ -0,0 +1,566 @@
+#pragma once
+#include <ATen/Context.h>
+#include <ATen/NestedTensorImpl.h>
+#include <ATen/TensorSubclassLikeUtils.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/core/Tensor.h>
+#include <ATen/core/grad_mode.h>
+#include <ATen/native/DispatchStub.h>
+#include <c10/core/ScalarType.h>
+
+#include <c10/util/Exception.h>
+#include <c10/util/env.h>
+#include <c10/util/irange.h>
+
+#include <c10/core/SymInt.h>
+#include <c10/core/SymFloat.h>
+#include <c10/util/string_view.h>
+#include <c10/util/Array.h>
+#include <cmath>
+#include <cstdint>
+#include <functional>
+
+namespace sdp {
+
+// Equals the number of non-error SDPBackend enumerators below.
+// NOTE(review): presumably must be kept in sync when a backend is added.
+constexpr int32_t num_backends = 5;
+// Identifies a scaled-dot-product-attention backend. `error` is the only
+// negative value; the other enumerators are numbered 0 through 4.
+enum class SDPBackend {
+  error = -1,
+  math = 0,
+  flash_attention = 1,
+  efficient_attention = 2,
+  cudnn_attention = 3,
+  overrideable = 4
+};
+
+// Note: if this enum changes, make sure to also update the templated enum in
+// mem_eff/kernel_forward.h and mem_eff/kernel_backward.h.
+enum class CustomMaskType {
+  NoCustomMask = 0,
+  CausalFromTopLeft = 1,
+  CausalFromBottomRight = 2,
+  // Implicitly 3, i.e. the number of mask types listed above.
+  NumCustomMaskTypes,
+};
+
+// Bundle of inputs examined by the check_* predicates in this header.
+struct sdp_params {
+  at::Tensor query;
+  at::Tensor key;
+  at::Tensor value;
+  // Optional attention mask; empty when no mask was supplied.
+  std::optional<at::Tensor> attn_mask;
+  // Dropout value; check_for_dropout rejects anything greater than 0.0.
+  double dropout;
+  bool is_causal;
+  // When true, check_batch_size_and_num_heads_dense<true> validates head
+  // counts with check_grouped_query_attention instead of requiring equality.
+  bool enable_gqa;
+};
+
+// Declaration only: maps a set of SDP inputs to an SDPBackend value. The
+// definition is not part of this header.
+SDPBackend select_sdp_backend_cpp(sdp_params const& kernel_params);
+
+// Returns the softmax scaling factor: the caller-provided `scale` when one is
+// set, otherwise 1 / sqrt(size of query's last dimension).
+inline c10::SymFloat calculate_scale(
+    const at::Tensor& query,
+    std::optional<double> scale) {
+  if (scale.has_value()) {
+    return c10::SymFloat(scale.value());
+  }
+  // No explicit scale: derive it from the last dimension of `query`.
+  auto last_dim = c10::SymFloat(query.sym_size(-1));
+  return c10::SymFloat(1.0) / last_dim.sqrt();
+}
+
+using c10::array_of;
+
+// True when grad mode is enabled and at least one of query/key/value requires
+// a gradient.
+inline bool input_requires_grad(sdp_params const& params) {
+  const bool some_input_needs_grad = params.query.requires_grad() ||
+      params.key.requires_grad() || params.value.requires_grad();
+  if (!some_input_needs_grad) {
+    return false;
+  }
+  return at::GradMode::is_enabled();
+}
+
+// True when at least one of query/key/value is a nested tensor with strided
+// layout.
+inline bool has_for_nested_inputs(sdp_params const& params) {
+  const auto is_strided_nested = [](const at::Tensor& t) {
+    return t.is_nested() && t.layout() == c10::kStrided;
+  };
+  return is_strided_nested(params.query) || is_strided_nested(params.key) ||
+      is_strided_nested(params.value);
+}
+
+// True when at least one of query/key/value is not a nested tensor.
+inline bool has_for_dense_inputs(sdp_params const& params) {
+  const bool all_nested = params.query.is_nested() && params.key.is_nested() &&
+      params.value.is_nested();
+  return !all_nested;
+}
+
+// True when none of query/key/value is a nested tensor.
+inline bool has_only_dense_inputs(sdp_params const& params) {
+  const bool any_nested = params.query.is_nested() || params.key.is_nested() ||
+      params.value.is_nested();
+  return !any_nested;
+}
+
+// Returns true when query, key and value share one dtype and that dtype is
+// listed in `allowed_dtypes`. Otherwise returns false, emitting a warning
+// first when `debug` is set.
+template <typename dtype_vector>
+inline bool check_tensor_dtype(
+    sdp_params const& params,
+    dtype_vector allowed_dtypes,
+    bool debug) {
+  auto query_dtype = params.query.dtype();
+  const bool dtype_ok = query_dtype == params.key.dtype() &&
+      query_dtype == params.value.dtype() &&
+      (std::find(allowed_dtypes.begin(), allowed_dtypes.end(), query_dtype) !=
+       allowed_dtypes.end());
+  if (dtype_ok) {
+    return true;
+  }
+  if (debug) {
+    TORCH_WARN(
+        "Expected query, key and value to all be of dtype: {",
+        c10::Join(", ", allowed_dtypes),
+        "}. Got ",
+        "Query dtype: ",
+        params.query.dtype(),
+        ", Key dtype: ",
+        params.key.dtype(),
+        ", and Value dtype: ",
+        params.value.dtype(),
+        " instead.");
+  }
+  return false;
+}
+
+
+// Returns true when each of the three sizes either equals the largest of them
+// or is 1. Otherwise returns false, emitting a warning that names
+// `param_name` when `debug` is set.
+inline bool try_broadcast_param_size(
+    const c10::SymInt q_size,
+    const c10::SymInt k_size,
+    const c10::SymInt v_size,
+    c10::string_view param_name,
+    bool debug) {
+  auto largest = std::max({q_size, k_size, v_size});
+  // A size blocks broadcasting when it is neither the largest size nor 1.
+  const auto blocks_broadcast = [&largest](const c10::SymInt& candidate) {
+    return candidate != largest && candidate != 1;
+  };
+  if (blocks_broadcast(q_size) || blocks_broadcast(k_size) ||
+      blocks_broadcast(v_size)) {
+    if (debug) {
+      TORCH_WARN(
+          "Both fused kernels require query, key and value to have broadcastable ",
+          param_name,
+          "got Query ",
+          param_name,
+          q_size,
+          ", Key ",
+          param_name,
+          k_size,
+          ", Value ",
+          param_name,
+          v_size,
+          " instead.");
+    }
+    return false;
+  }
+  return true;
+}
+
+// Validates one nested tensor: dimension 1 must have a single consistent size
+// and no constituent may have a zero in column 1 of the nested-size table.
+// Returns false otherwise, warning with `param_name` when `debug` is set.
+inline bool check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper(
+    at::Tensor const& param,
+    c10::string_view param_name,
+    bool debug) {
+  const auto nested_impl = at::native::get_nested_tensor_impl(param);
+  const at::Tensor& nested_sizes = nested_impl->get_nested_sizes();
+  auto head_count = nested_impl->opt_size(1);
+  if (!head_count.has_value()) {
+    // num_head_dims is ragged
+    if (debug) {
+      TORCH_WARN(
+          "Fused kernels do not support ragged num_head_dims, ",
+          param_name,
+          "has a ragged num_heads.");
+    }
+    return false;
+  }
+
+  auto* size_table = nested_sizes.data_ptr<int64_t>();
+  const int64_t num_rows = param.size(0);
+  const int64_t row_stride = nested_sizes.stride(0);
+
+  // This is being called inside sdp with shape [batch, heads, {seq_len}, dim],
+  // so column 1 of each row in the size table is that constituent's seq_len.
+  for (const auto row : c10::irange(num_rows)) {
+    const auto seq_len_index = (row * row_stride) + 1;
+    if (size_table[seq_len_index] != 0) {
+      continue;
+    }
+    if (debug) {
+      TORCH_WARN(
+          "Fused kernels do not support seq_len == 0, ",
+          param_name,
+          "has a seq len of 0.");
+    }
+    return false;
+  }
+  return true;
+}
+
+// Runs the per-tensor nested check on every nested input, then verifies that
+// the head counts of query/key/value are equal or, when no gradient is
+// required, broadcastable.
+inline bool check_for_seq_len_0_nested_tensor(sdp_params const& params, bool debug) {
+  // When this function is called we are assured that the nt is dim==4.
+  // Dense inputs are skipped; the first failing nested input ends the check.
+  if (params.query.is_nested() &&
+      !check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper(
+          params.query, "query ", debug)) {
+    return false;
+  }
+  if (params.key.is_nested() &&
+      !check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper(
+          params.key, "key ", debug)) {
+    return false;
+  }
+  if (params.value.is_nested() &&
+      !check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper(
+          params.value, "value ", debug)) {
+    return false;
+  }
+
+  // None of the inputs has ragged num_heads at this point, so .size(1) is
+  // safe to access.
+  auto heads_q = params.query.size(1);
+  auto heads_k = params.key.size(1);
+  auto heads_v = params.value.size(1);
+  if (heads_q == heads_k && heads_q == heads_v) {
+    return true;
+  }
+
+  // Head counts differ: broadcasting is only allowed without gradients.
+  if (input_requires_grad(params)) {
+    if (debug) {
+      TORCH_WARN(
+            "Both fused kernels do not support training with broadcasted NT inputs.");
+    }
+    return false;
+  }
+  return try_broadcast_param_size(
+      heads_q, heads_k, heads_v, "num heads ", debug);
+}
+
+// Returns true only when every input is dense; warns when `debug` is set and
+// a nested input is present.
+inline bool check_nested_tensor(sdp_params const& params, bool debug) {
+  if (has_only_dense_inputs(params)) {
+    return true;
+  }
+  // At least one nested tensor is present.
+  if (debug) {
+    TORCH_WARN(
+        "Both fused kernels of cpp version currently do not support Nested Tensor inputs.");
+  }
+  return false;
+}
+
+// Returns false (warning when `debug` is set) if params.dropout is greater
+// than zero; returns true otherwise.
+inline bool check_for_dropout(sdp_params const& params, bool debug) {
+  const bool dropout_requested = params.dropout > 0.0;
+  if (!dropout_requested) {
+    return true;
+  }
+  if (debug) {
+    TORCH_WARN("Both fused kernels do not support non-zero dropout.");
+  }
+  return false;
+}
+
+// Returns false (warning when `debug` is set) if input_requires_grad(params)
+// holds; returns true otherwise.
+inline bool check_requires_grad_and_nested(sdp_params const& params, bool debug) {
+  if (!input_requires_grad(params)) {
+    return true;
+  }
+  if (debug) {
+    TORCH_WARN(
+        "Memory efficient attention currently doesn't support training with NT inputs.");
+  }
+  return false;
+}
+
+// Returns true only when no attention mask was supplied; warns when `debug`
+// is set and a mask is present.
+inline bool check_for_attn_mask(sdp_params const& params, bool debug) {
+  const bool mask_supplied = params.attn_mask.has_value();
+  if (!mask_supplied) {
+    return true;
+  }
+  if (debug) {
+    TORCH_WARN("Flash Attention does not support non-null attn_mask.");
+  }
+  return false;
+}
+
+// Checks that the optional attention mask has a shape this path accepts.
+//
+// Returns true when no mask is present, or when the mask does not require
+// grad and is either
+//   * 2-D with sizes ({Q_seq_len, 1}, {KV_seq_len, 1}), or
+//   * 4-D with sizes ({Batch, 1}, {Num_heads, 1}, {Q_seq_len, 1},
+//     {KV_seq_len, 1}).
+// Returns false for every other mask. The rank is validated before any
+// dimension is indexed, so masks with fewer than two dimensions are rejected
+// instead of raising a dimension-out-of-range error from sym_size(-2).
+//
+// The sizes of `query`/`key` are read at indices 0, 1 and 2, so both are
+// expected to have at least three dimensions when a mask is supplied.
+inline bool check_attn_mask_shape(sdp_params const& params, bool debug) {
+  // Bind by reference: no need to copy the optional tensor.
+  const auto& attn_mask = params.attn_mask;
+  if (!attn_mask.has_value()) {
+    return true;
+  }
+  const at::Tensor& mask = attn_mask.value();
+  if (mask.requires_grad()) {
+    return false;
+  }
+  const auto mask_dim = mask.dim();
+  if (mask_dim == 2 || mask_dim == 4) {
+    auto batchSize = params.query.sym_size(0);
+    auto qSize = params.query.sym_size(2);
+    auto kvSize = params.key.sym_size(2);
+    auto num_head = params.query.sym_size(1);
+    // The two trailing dimensions must match the sequence lengths or be 1.
+    if (mask.sym_size(-2) != qSize && mask.sym_size(-2) != 1) {
+      return false;
+    }
+    if (mask.sym_size(-1) != kvSize && mask.sym_size(-1) != 1) {
+      return false;
+    }
+    if (mask_dim == 2) {
+      return true;
+    }
+    // 4-D: the leading dimensions must match batch / head count or be 1.
+    if ((mask.sym_size(0) == 1 || mask.sym_size(0) == batchSize)
+        && (mask.sym_size(1) == 1 || mask.sym_size(1) == num_head)) {
+      return true;
+    }
+  }
+  if (debug) {
+    TORCH_WARN("Please use the following attn mask shapes: ",
+        "2d - ({Q_seq_len, 1}  x {KV_seq_len, 1}); ",
+        "4d - ({Batch, 1} x {Num_heads, 1} x {Q_seq_len, 1}  x {KV_seq_len, 1})");
+  }
+  return false;
+}
+
+// Returns true when query, key and value are all 4-dimensional; otherwise
+// returns false, warning with the actual ranks when `debug` is set.
+inline bool check_tensor_shapes(sdp_params const& params, bool debug) {
+  auto query_dim = params.query.dim();
+  const bool all_rank_four = query_dim == params.key.dim() &&
+      query_dim == params.value.dim() && (query_dim == 4);
+  if (all_rank_four) {
+    return true;
+  }
+  if (debug) {
+    TORCH_WARN(
+        "All fused kernels requires query, key and value to be 4 dimensional, but got Query dim: ",
+        query_dim,
+        ", Key dim: ",
+        params.key.dim(),
+        ", Value dim: ",
+        params.value.dim(),
+        " instead.");
+  }
+  return false;
+}
+
+// Returns true when opt_size(2) of the nested tensor `param` has a value,
+// i.e. dimension 2 has a single size across the nested tensor. Otherwise
+// returns false, emitting a warning first when `debug` is set.
+// `param` is passed to get_nested_tensor_impl, so it must be a nested tensor.
+inline bool check_safe_kv_broadcast(at::Tensor const& param, bool debug) {
+  const auto nt_tensor_impl = at::native::get_nested_tensor_impl(param);
+  auto seq_len = nt_tensor_impl->opt_size(2);
+  if (!seq_len.has_value()) {
+    if (debug) {
+      // Terminated explicitly so the statement does not depend on the macro
+      // expansion supplying its own semicolon.
+      TORCH_WARN(
+          "For both fused kernels, if one of key/value batch_size requires "
+          "broadcasting and the other does not, then the other must have a ",
+          "consistent seq_len dim.");
+    }
+    return false;
+  }
+  return true;
+}
+
+// Validates head counts for grouped query attention. The head count is read
+// from dimension -3 of each input.
+//
+// Returns true when key and value have the same number of heads and that
+// number is non-zero and evenly divides the number of query heads. Otherwise
+// returns false, emitting a warning first when `debug` is set.
+inline bool check_grouped_query_attention(sdp_params const& params, bool debug) {
+  const auto q_num_heads = params.query.sym_size(-3);
+  const auto k_num_heads = params.key.sym_size(-3);
+  const auto v_num_heads = params.value.sym_size(-3);
+  const bool same_kv_heads = k_num_heads == v_num_heads;
+
+  if (!(same_kv_heads)){
+    if (debug) {
+      TORCH_WARN(
+          "Both fused kernels require key and value to have the same num_heads and batch_size but got: ",
+          "Key sizes: ",
+          params.key.sizes(),
+          ", Value sizes: ",
+          params.value.sizes(),
+          ", Query sizes: ",
+          params.query.sizes(),
+          " instead.");
+    }
+    return false;
+  }
+  // Check if grouped query attention is supported and validate the number of
+  // heads. A key/value head count of zero is rejected before the modulo, since
+  // taking a remainder by zero is undefined.
+  if (k_num_heads == 0 || q_num_heads % k_num_heads != 0) {
+    if (debug) {
+      TORCH_WARN(
+          "FlashAttentionV2 only supports grouped query attention, where the number of heads in key/value must divide number of heads in query.",
+          " Got input Key sizes(): ",
+          params.key.sym_size(-3),
+          ", Value sizes(): ",
+          params.value.sym_size(-3),
+          ", Query sizes(): ",
+          params.query.sym_size(-3),
+          " instead.");
+    }
+    return false;
+  }
+  return true;
+}
+
+// For dense inputs: requires query, key and value to share one batch size.
+// Head counts must then either pass check_grouped_query_attention (when
+// params.enable_gqa is set and `supports_gqa` is true) or be identical.
+// Returns false otherwise, warning when `debug` is set.
+template <bool supports_gqa>
+inline bool check_batch_size_and_num_heads_dense(sdp_params const& params, bool debug) {
+  // Expected to run after check_tensor_shapes, which ensures every input is
+  // 4-dimensional so the size lookups below cannot fail.
+  const auto batch_q = params.query.sym_size(0);
+  const auto batch_k = params.key.sym_size(0);
+  const auto batch_v = params.value.sym_size(0);
+  const bool batch_sizes_match = batch_q == batch_k && batch_q == batch_v;
+
+  const auto heads_q = params.query.sym_size(-3);
+  const auto heads_k = params.key.sym_size(-3);
+  const auto heads_v = params.value.sym_size(-3);
+  const bool head_counts_match = heads_q == heads_k && heads_q == heads_v;
+
+  if (!batch_sizes_match) {
+    if (debug) {
+      TORCH_WARN(
+          "For dense inputs, both fused kernels require query, key and value to have the same batch_size. ",
+          "Query.sizes(): ",
+          params.query.sizes(),
+          ", Key.sizes(): ",
+          params.key.sizes(),
+          ", Value.sizes(): ",
+          params.value.sizes(),
+          " instead. To broadcast dense inputs, try using unsqueeze and expand_to before passing them into the kernel.");
+    }
+    return false;
+  }
+
+  // With GQA enabled and supported, the head-count rule is delegated.
+  if (params.enable_gqa && supports_gqa) {
+    return check_grouped_query_attention(params, debug);
+  }
+
+  if (head_counts_match) {
+    // Every check passed.
+    return true;
+  }
+  if (debug) {
+    TORCH_WARN(
+        "For dense input, both fused kernels require query, key and value to have the same num_heads. ",
+        "Query.sizes(): ",
+        params.query.sizes(),
+        ", Key sizes(): ",
+        params.key.sizes(),
+        ", Value sizes(): ",
+        params.value.sizes(),
+        " instead. To broadcast dense inputs, try using unsqueeze and expand_to before passing them into the kernel.");
+  }
+  return false;
+}
+
+// Batch-size gate for nested tensor (NT) inputs: equal batch sizes pass, and
+// unequal ones pass only if input_requires_grad is false and the broadcast
+// checks below accept them. Expected to be called after check_tensor_shapes.
+// The num_heads logic for nested input is checked in
+// check_for_seq_len_0_nested_tensor, which makes sure num_heads is not ragged.
+inline bool check_batch_size_nested(sdp_params const& params, bool debug) {
+  const auto batch_q = params.query.sym_size(0);
+  const auto batch_k = params.key.sym_size(0);
+  const auto batch_v = params.value.sym_size(0);
+
+  // Nothing to broadcast when all three already agree.
+  if (batch_q == batch_k && batch_q == batch_v) {
+    return true;
+  }
+
+  if (input_requires_grad(params)) {
+    if (debug) {
+      TORCH_WARN(
+          "Both fused kernels do not support training with broadcasted NT inputs.");
+    }
+    return false;
+  }
+
+  const bool can_broadcast = try_broadcast_param_size(
+      batch_q, batch_k, batch_v, "batch size ", debug);
+  if (!can_broadcast) {
+    return false;
+  }
+
+  // If only one of k or v needs its batch size broadcast, the other one must
+  // have a consistent seq_len dim.
+  if (batch_k == 1 && batch_v != 1 &&
+      !check_safe_kv_broadcast(params.value, debug)) {
+    return false;
+  }
+  if (batch_v == 1 && batch_k != 1 &&
+      !check_safe_kv_broadcast(params.key, debug)) {
+    return false;
+  }
+  return true;
+}
+
+// Rejects dense inputs whose query or key has size 0 in dim -2 (seq_len);
+// such 0 sized tensors cause the fused path to error with unaligned mask.
+inline bool check_nonzero_sequence_lengths_dense(sdp_params const& params, bool debug) {
+  const bool q_is_empty = params.query.sym_size(-2) == 0;
+  const bool kv_is_empty = params.key.sym_size(-2) == 0;
+  if (!q_is_empty && !kv_is_empty) {
+    return true;
+  }
+  if (debug) {
+    TORCH_WARN(
+        "All fused kernels do not support zero seq_len_q or seq_len_kv.");
+  }
+  return false;
+}
+
+// Verifies that the last dimension of query, key, value and, when present,
+// attn_mask has stride 1, as all fused kernels require for dense inputs.
+// (For NestedTensors the stride check is done within the kernel, and
+// .contiguous will be called if needed.)
+// With ignore_singleton_dim, a query whose last dimension has size 1 waives
+// the q/k/v requirement: the stride of a size-1 head_dim doesn't matter, and
+// this runs before head_dim is padded. The attn_mask check is never waived.
+// See https://github.com/pytorch/pytorch/issues/116333
+template<bool ignore_singleton_dim>
+inline bool check_last_dim_stride_equals_1_dense(sdp_params const& params, bool debug) {
+  bool qkv_unit_stride = params.query.sym_stride(-1) == 1 &&
+      params.key.sym_stride(-1) == 1 && params.value.sym_stride(-1) == 1;
+  if (ignore_singleton_dim && !qkv_unit_stride) {
+    qkv_unit_stride = params.query.sym_size(-1) == 1;
+  }
+
+  bool mask_unit_stride = true;
+  if (params.attn_mask.has_value()) {
+    mask_unit_stride = params.attn_mask.value().sym_stride(-1) == 1;
+  }
+
+  if (qkv_unit_stride && mask_unit_stride) {
+    return true;
+  }
+
+  // The explanatory message is only assembled on the failing path.
+  if (debug) {
+    std::ostringstream mask_suffix;
+    if (params.attn_mask.has_value()) {
+      mask_suffix << ", Attn_mask.stride(-1): "
+                  << params.attn_mask.value().sym_stride(-1);
+    }
+    mask_suffix << " instead.";
+    TORCH_WARN(
+        "All fused kernels require the last dimension of the input to have stride 1. ",
+        "Got Query.stride(-1): ", params.query.sym_stride(-1),
+        ", Key.stride(-1): ", params.key.sym_stride(-1),
+        ", Value.stride(-1): ", params.value.sym_stride(-1),
+        mask_suffix.str());
+  }
+  return false;
+}
+
+// Returns false (warning when debug is set) if the user has explicitly turned
+// off the flash sdp kernels in the global context; returns true otherwise.
+inline bool check_runtime_disabled_flash(sdp_params const& params, bool debug) {
+  if (at::globalContext().userEnabledFlashSDP()) {
+    return true;
+  }
+  if (debug) {
+    TORCH_WARN("Flash attention has been runtime disabled.");
+  }
+  return false;
+}
+
+// Returns false (warning when debug is set) if the user has explicitly turned
+// off the mem_efficient sdp kernels in the global context; true otherwise.
+inline bool check_runtime_disabled_mem_efficient(sdp_params const& params, bool debug) {
+  if (at::globalContext().userEnabledMemEfficientSDP()) {
+    return true;
+  }
+  if (debug) {
+    TORCH_WARN("Memory Efficient attention has been runtime disabled.");
+  }
+  return false;
+}
+
+
+} // namespace sdp
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/Factory.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/Factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0302417cdce06523958c9e7c66c08b54aa18410
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/Factory.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <ATen/core/Tensor.h>
+
+namespace at {
+namespace native {
+namespace mobile {
+// Declaration only (defined elsewhere). NOTE(review): by its name this presumably returns `input` as-is or a padded, contiguous copy in `memory_format` — confirm against the definition.
+Tensor allocate_padded_contiguous_if_needed(
+    const Tensor& input,
+    c10::MemoryFormat memory_format);
+
+// TODO: Remove this function when at::native::empty() is modified to accept a
+// custom memory allocator.
+// Declaration only (defined elsewhere). NOTE(review): presumably creates an empty tensor of `size`/`dtype`/`memory_format` whose storage has extra tail padding, optionally named by `maybe_names` — confirm against the definition.
+at::Tensor empty_with_tail_padding(
+    IntArrayRef size,
+    const caffe2::TypeMeta dtype,
+    c10::MemoryFormat memory_format,
+    std::optional<DimnameList> maybe_names);
+
+} // namespace mobile
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..adb5f1cfa49f9726db5a9304b2546b1ceff52eb3
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamUtils.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <c10/util/ArrayRef.h>
+#include <vector>
+
+namespace at {
+namespace native {
+
+// Normalizes a per-dimension parameter list to exactly expected_dim entries:
+// a single entry is repeated expected_dim times, a list that already has
+// expected_dim entries is copied, and any other length raises a c10::Error
+// naming param_name. TORCH_CHECK replaces the deprecated AT_ERROR macro.
+template <typename T>
+inline std::vector<T> _expand_param_if_needed(
+    ArrayRef<T> list_param,
+    const char* param_name,
+    int64_t expected_dim) {
+  if (list_param.size() == 1) {
+    return std::vector<T>(expected_dim, list_param[0]);
+  }
+  TORCH_CHECK(
+      static_cast<int64_t>(list_param.size()) == expected_dim,
+      "expected ", param_name, " to be a single integer value or a list of ", expected_dim, " values to match the convolution dimensions, but got ", param_name, "=", list_param);
+  return list_param.vec();
+}
+
+// int64_t overload: forwards to _expand_param_if_needed, which returns the
+// list expanded to expected_dim entries or raises on a length mismatch.
+inline std::vector<int64_t> expand_param_if_needed(
+    IntArrayRef list_param, const char* param_name, int64_t expected_dim) {
+  return _expand_param_if_needed<int64_t>(list_param, param_name, expected_dim);
+}
+
+// c10::SymInt overload: forwards to _expand_param_if_needed, which returns the
+// list expanded to expected_dim entries or raises on a length mismatch.
+inline std::vector<c10::SymInt> expand_param_if_needed(
+    SymIntArrayRef list_param, const char* param_name, int64_t expected_dim) {
+  return _expand_param_if_needed<c10::SymInt>(list_param, param_name, expected_dim);
+}
+
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamsHash.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamsHash.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b7894cb8549f59d34b9b52d660780b729ada575
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamsHash.h
@@ -0,0 +1,104 @@
+#pragma once
+
+#include <c10/util/irange.h>
+#include <memory>
+#include <mutex>
+
+namespace at::native {
+
+// Hashing machinery for Params.
+// ParamsHash runs the 32-bit FNV-1a variant of the Fowler–Noll–Vo hash over
+// the raw bytes of a Params object; see
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+template <typename Params>
+struct ParamsHash {
+  // The object is read out byte by byte, so Params must be a POD; the
+  // assertion itself only enforces standard layout.
+  static_assert(std::is_standard_layout_v<Params>, "Params is not POD");
+
+  size_t operator()(const Params& params) const {
+    const auto* bytes = reinterpret_cast<const uint8_t*>(&params);
+    uint32_t hash = 0x811C9DC5; // FNV offset basis
+    for (const auto idx : c10::irange(sizeof(Params))) {
+      hash = (hash ^ bytes[idx]) * 0x01000193; // xor byte in, multiply by prime
+    }
+    // The 32-bit digest is widened to size_t for the caller.
+    return static_cast<size_t>(hash);
+  }
+};
+
+// Byte-wise equality functor: two Params compare equal exactly when their
+// object representations are identical over all sizeof(Params) bytes.
+template <typename Params>
+struct ParamsEqual {
+  // Raw memory is compared, so Params must be a POD; the assertion itself
+  // only enforces standard layout.
+  static_assert(std::is_standard_layout_v<Params>, "Params is not POD");
+
+  bool operator()(const Params& a, const Params& b) const {
+    return memcmp(&a, &b, sizeof(Params)) == 0;
+  }
+};
+
+// Wraps a POD so that all of its bytes, padding included, are well defined:
+// zero-filled on construction and copied byte-for-byte afterwards. This avoids
+// unwittingly leaving padding bytes uninitialized (e.g., passing Params by value).
+template <typename T>
+struct ParamsWrapper {
+  T pod;
+  static_assert(std::is_standard_layout_v<T>, "ParamsWrapper cannot wrap non-POD data");
+
+  ParamsWrapper() {
+    memset(&(this->pod), 0, sizeof(this->pod));
+  }
+
+  ParamsWrapper(const ParamsWrapper& other) {
+    memcpy(&(this->pod), &(other.pod), sizeof(this->pod));
+  }
+
+  ParamsWrapper(ParamsWrapper&& other) noexcept {
+    memcpy(&(this->pod), &(other.pod), sizeof(this->pod));
+  }
+
+  // Both assignments skip the copy on self-assignment: memcpy is undefined for overlapping buffers.
+  ParamsWrapper& operator=(const ParamsWrapper& other) {
+    if (this != &other) {
+      memcpy(&(this->pod), &(other.pod), sizeof(this->pod));
+    }
+    return *this;
+  }
+
+  ParamsWrapper& operator=(ParamsWrapper&& other) noexcept {
+    if (this != &other) {
+      memcpy(&(this->pod), &(other.pod), sizeof(this->pod));
+    }
+    return *this;
+  }
+
+  inline friend bool operator==(const ParamsWrapper& lhs, const ParamsWrapper& rhs) noexcept {
+    return memcmp(&(lhs.pod), &(rhs.pod), sizeof(lhs.pod)) == 0;
+  }
+};
+
+// Wrapped version of the byte-wise FNV-1a hash: it hashes the `pod` member of
+// a ParamsWrapper, which allows the outer struct to have custom copy and move
+// constructors for additional safety.
+template <typename ParamsWrapper>
+struct ParamsWrapperHash {
+  // The wrapped data is read out as raw bytes when hashing, so it must be a
+  // POD; the assertion itself only enforces standard layout.
+  static_assert(
+      std::is_standard_layout_v<decltype(ParamsWrapper::pod)>,
+      "ParamsWrapper cannot wrap non-POD data");
+
+  size_t operator()(const ParamsWrapper& params_wrapper) const {
+    const auto* bytes = reinterpret_cast<const uint8_t*>(&(params_wrapper.pod));
+    uint32_t hash = 0x811C9DC5; // FNV offset basis
+    for (const auto idx : c10::irange(sizeof(params_wrapper.pod))) {
+      hash = (hash ^ bytes[idx]) * 0x01000193; // xor byte in, multiply by prime
+    }
+    return static_cast<size_t>(hash);
+  }
+};
+
+} // namespace at::native
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized.h
new file mode 100644
index 0000000000000000000000000000000000000000..038954ed0ba84d923b0e6af82f2c64b23239084e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized.h
@@ -0,0 +1,113 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/_empty_per_channel_affine_quantized_ops.h>
+
+namespace at {
+
+// IntArrayRef + TensorOptions overload: `size` is converted with fromIntArrayRefSlow, `options` is unpacked into dtype/layout/device/pin_memory, and memory_format is resolved by check_tensor_options_and_extract_memory_format before calling the operator. The symint:: twin has the same body and is enabled only for T = int64_t.
+// aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+inline at::Tensor _empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options={}, ::std::optional<at::MemoryFormat> memory_format=c10::MemoryFormat::Contiguous) {
+    return at::_ops::_empty_per_channel_affine_quantized::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor _empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options={}, ::std::optional<at::MemoryFormat> memory_format=c10::MemoryFormat::Contiguous) {
+    return at::_ops::_empty_per_channel_affine_quantized::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+  }
+}
+// IntArrayRef overload with the tensor options already unpacked: `size` is converted with fromIntArrayRefSlow and every other argument is forwarded unchanged to the operator. The symint:: twin has the same body and is enabled only for T = int64_t.
+// aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+inline at::Tensor _empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory, ::std::optional<at::MemoryFormat> memory_format) {
+    return at::_ops::_empty_per_channel_affine_quantized::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor _empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory, ::std::optional<at::MemoryFormat> memory_format) {
+    return at::_ops::_empty_per_channel_affine_quantized::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format);
+  }
+}
+// SymIntArrayRef + TensorOptions overload: `size` is passed through as-is, `options` is unpacked into dtype/layout/device/pin_memory, and memory_format is resolved by check_tensor_options_and_extract_memory_format. The symint:: twin has the same body and is enabled only for T = c10::SymInt.
+// aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+inline at::Tensor _empty_per_channel_affine_quantized_symint(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options={}, ::std::optional<at::MemoryFormat> memory_format=c10::MemoryFormat::Contiguous) {
+    return at::_ops::_empty_per_channel_affine_quantized::call(size, scales, zero_points, axis, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor _empty_per_channel_affine_quantized(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options={}, ::std::optional<at::MemoryFormat> memory_format=c10::MemoryFormat::Contiguous) {
+    return at::_ops::_empty_per_channel_affine_quantized::call(size, scales, zero_points, axis, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+  }
+}
+// SymIntArrayRef overload with the tensor options already unpacked: every argument is forwarded unchanged to the operator. The symint:: twin has the same body and is enabled only for T = c10::SymInt.
+// aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=contiguous_format) -> Tensor
+inline at::Tensor _empty_per_channel_affine_quantized_symint(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory, ::std::optional<at::MemoryFormat> memory_format) {
+    return at::_ops::_empty_per_channel_affine_quantized::call(size, scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor _empty_per_channel_affine_quantized(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory, ::std::optional<at::MemoryFormat> memory_format) {
+    return at::_ops::_empty_per_channel_affine_quantized::call(size, scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format);
+  }
+}
+// Out variant with `out` as the FIRST parameter and IntArrayRef sizes: `size` is converted with fromIntArrayRefSlow and `out` is moved to the last position of the operator call; the operator's return value (at::Tensor&) is passed back. The symint:: twin is enabled only for T = int64_t.
+// aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _empty_per_channel_affine_quantized_out(at::Tensor & out, at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::MemoryFormat> memory_format=c10::MemoryFormat::Contiguous) {
+    return at::_ops::_empty_per_channel_affine_quantized_out::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, memory_format, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & _empty_per_channel_affine_quantized_out(at::Tensor & out, at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::MemoryFormat> memory_format=c10::MemoryFormat::Contiguous) {
+    return at::_ops::_empty_per_channel_affine_quantized_out::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, memory_format, out);
+  }
+}
+// Out variant in schema argument order (`out` LAST, no defaulted arguments) with IntArrayRef sizes: `size` is converted with fromIntArrayRefSlow and everything else is forwarded unchanged. The symint:: twin is enabled only for T = int64_t.
+// aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _empty_per_channel_affine_quantized_outf(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+    return at::_ops::_empty_per_channel_affine_quantized_out::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, memory_format, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & _empty_per_channel_affine_quantized_outf(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+    return at::_ops::_empty_per_channel_affine_quantized_out::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, memory_format, out);
+  }
+}
+// Out variant with `out` as the FIRST parameter and SymIntArrayRef sizes: `size` is passed through as-is and `out` is moved to the last position of the operator call. The symint:: twin is enabled only for T = c10::SymInt.
+// aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _empty_per_channel_affine_quantized_symint_out(at::Tensor & out, c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::MemoryFormat> memory_format=c10::MemoryFormat::Contiguous) {
+    return at::_ops::_empty_per_channel_affine_quantized_out::call(size, scales, zero_points, axis, memory_format, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & _empty_per_channel_affine_quantized_out(at::Tensor & out, c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::MemoryFormat> memory_format=c10::MemoryFormat::Contiguous) {
+    return at::_ops::_empty_per_channel_affine_quantized_out::call(size, scales, zero_points, axis, memory_format, out);
+  }
+}
+// Out variant in schema argument order (`out` LAST, no defaulted arguments) with SymIntArrayRef sizes: every argument is forwarded unchanged to the operator. The symint:: twin is enabled only for T = c10::SymInt.
+// aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _empty_per_channel_affine_quantized_symint_outf(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+    return at::_ops::_empty_per_channel_affine_quantized_out::call(size, scales, zero_points, axis, memory_format, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & _empty_per_channel_affine_quantized_outf(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+    return at::_ops::_empty_per_channel_affine_quantized_out::call(size, scales, zero_points, axis, memory_format, out);
+  }
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_addcmul_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_addcmul_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..0be9636383b059f1a7ae02294033307ca0f4be9b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_addcmul_native.h
@@ -0,0 +1,35 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// Declarations of the native _foreach_addcmul kernels, in three groups by the type of the scaling argument (Scalar `value`, ArrayRef<Scalar> `scalars`, Tensor `scalars`). Each group declares a vector-returning form, a void form with a trailing underscore, and a void `_out` form taking an extra `out` list. NOTE(review): the `_slow` / `_cuda` suffixes presumably mark the fallback and CUDA implementations — confirm in native_functions.yaml.
+namespace at {
+namespace native {
+TORCH_API ::std::vector<at::Tensor> foreach_tensor_addcmul_scalar_slow(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1);
+TORCH_API void _foreach_addcmul_Scalar_out(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value, at::TensorList out);
+TORCH_API void foreach_tensor_addcmul_scalar_slow_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1);
+TORCH_API ::std::vector<at::Tensor> foreach_tensor_addcmul_scalar_cuda(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1);
+TORCH_API void foreach_tensor_addcmul_scalar_cuda_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1);
+TORCH_API ::std::vector<at::Tensor> foreach_tensor_addcmul_scalarlist_slow(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars);
+TORCH_API void _foreach_addcmul_ScalarList_out(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars, at::TensorList out);
+TORCH_API void foreach_tensor_addcmul_scalarlist_slow_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars);
+TORCH_API ::std::vector<at::Tensor> foreach_tensor_addcmul_scalarlist_cuda(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars);
+TORCH_API void foreach_tensor_addcmul_scalarlist_cuda_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars);
+TORCH_API ::std::vector<at::Tensor> foreach_tensor_addcmul_tensor_slow(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars);
+TORCH_API void _foreach_addcmul_Tensor_out(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars, at::TensorList out);
+TORCH_API void foreach_tensor_addcmul_tensor_slow_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars);
+TORCH_API ::std::vector<at::Tensor> foreach_tensor_addcmul_tensor_cuda(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars);
+TORCH_API void foreach_tensor_addcmul_tensor_cuda_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars);
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..23021887b559d20b51da2b961b21b210c14563c6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Operator handle for aten::_foreach_expm1 (default overload): takes a tensor list and returns a vector of tensors. `call` and `redispatch` are the two static entry points; `redispatch` additionally takes an explicit DispatchKeySet.
+struct TORCH_API _foreach_expm1 {
+  using schema = ::std::vector<at::Tensor> (at::TensorList);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_expm1")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_expm1(Tensor[] self) -> Tensor[]")
+  static ::std::vector<at::Tensor> call(at::TensorList self);
+  static ::std::vector<at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self);
+};
+// Operator handle for aten::_foreach_expm1_ (default overload): the schema marks `self` as mutated (Tensor(a!)[]) and nothing is returned. `redispatch` additionally takes an explicit DispatchKeySet.
+struct TORCH_API _foreach_expm1_ {
+  using schema = void (at::TensorList);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_expm1_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_expm1_(Tensor(a!)[] self) -> ()")
+  static void call(at::TensorList self);
+  static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self);
+};
+// Operator handle for the "out" overload of aten::_foreach_expm1: reads `self`, and the schema marks the `out` list as mutated (Tensor(a!)[]); nothing is returned. `redispatch` additionally takes an explicit DispatchKeySet.
+struct TORCH_API _foreach_expm1_out {
+  using schema = void (at::TensorList, at::TensorList);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_expm1")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_expm1.out(Tensor[] self, *, Tensor(a!)[] out) -> ()")
+  static void call(at::TensorList self, at::TensorList out);
+  static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out);
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..420e3703d34a14b61e41a6251b2e8b84feb0580d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Operator handle for aten::_sparse_mask_projection (default overload): takes `self`, `mask` and the `accumulate_matches` flag (schema default False) and returns a Tensor. `redispatch` additionally takes an explicit DispatchKeySet.
+struct TORCH_API _sparse_mask_projection {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_mask_projection")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches);
+};
+// Operator handle for the "out" overload of aten::_sparse_mask_projection: same inputs plus an `out` tensor that the schema marks as mutated and returned (Tensor(a!)). `redispatch` additionally takes an explicit DispatchKeySet.
+struct TORCH_API _sparse_mask_projection_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_mask_projection")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_mask_projection.out(Tensor self, Tensor mask, bool accumulate_matches=False, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..57a5999e055083a5aebe599ac25fa68736d3e767
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Operator handle for aten::_sparse_sum_backward (default overload): takes `grad`, `self` and the `dim` list and returns a Tensor. `redispatch` additionally takes an explicit DispatchKeySet.
+struct TORCH_API _sparse_sum_backward {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, at::IntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_sum_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor")
+  static at::Tensor call(const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim);
+};
+// Operator handle for the "out" overload of aten::_sparse_sum_backward: same inputs plus an `out` tensor that the schema marks as mutated and returned (Tensor(a!)). `redispatch` additionally takes an explicit DispatchKeySet.
+struct TORCH_API _sparse_sum_backward_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_sum_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_sum_backward.out(Tensor grad, Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f9dcd4e1ed0ddf5157fc17f67554ef0cf9a6d69
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _test_autograd_multiple_dispatch_view_copy { // Bundles the schema type, name strings and static call/redispatch declarations for aten::_test_autograd_multiple_dispatch_view_copy (empty overload name).
+  using schema = at::Tensor (const at::Tensor &); // Function type with the same parameter/return types as call().
+  using ptr_schema = schema*;
+  // name, overload_name and schema_str are declared via a macro; see Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_autograd_multiple_dispatch_view_copy")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_autograd_multiple_dispatch_view_copy(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self); // Declaration only; returns at::Tensor by value.
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); // Same parameters as call(), with a leading c10::DispatchKeySet.
+};
+
+struct TORCH_API _test_autograd_multiple_dispatch_view_copy_out { // Bundles the schema type, name strings and static call/redispatch declarations for the "out" overload of aten::_test_autograd_multiple_dispatch_view_copy.
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &); // Function type with the same parameter/return types as call().
+  using ptr_schema = schema*;
+  // name, overload_name and schema_str are declared via a macro; see Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_autograd_multiple_dispatch_view_copy")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_autograd_multiple_dispatch_view_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out); // Declaration only; `out` is the trailing mutable-reference argument.
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); // Same parameters as call(), with a leading c10::DispatchKeySet.
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..c797b98f0e9d84cbe1c88fd0875f1c094b9fc521
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _unsafe_masked_index_put_accumulate { // Bundles the schema type, name strings and static call/redispatch declarations for aten::_unsafe_masked_index_put_accumulate (empty overload name).
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const c10::List<::std::optional<at::Tensor>> &, const at::Tensor &); // Function type with the same parameter/return types as call().
+  using ptr_schema = schema*;
+  // name, overload_name and schema_str are declared via a macro; see Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_unsafe_masked_index_put_accumulate")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_unsafe_masked_index_put_accumulate(Tensor self, Tensor mask, Tensor?[] indices, Tensor values) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional<at::Tensor>> & indices, const at::Tensor & values); // Declaration only; the schema's `Tensor?[] indices` is taken as a c10::List of optional tensors.
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional<at::Tensor>> & indices, const at::Tensor & values); // Same parameters as call(), with a leading c10::DispatchKeySet.
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/alias_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/alias_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..c70954c94dbbb9e2793741fefaf1919b021dd8c3
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/alias_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API alias { // Bundles the schema type, name strings and static call/redispatch declarations for aten::alias (empty overload name).
+  using schema = at::Tensor (const at::Tensor &); // Function type with the same parameter/return types as call().
+  using ptr_schema = schema*;
+  // name, overload_name and schema_str are declared via a macro; see Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::alias")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "alias(Tensor(a) self) -> Tensor(a)")
+  static at::Tensor call(const at::Tensor & self); // Declaration only; schema_str marks input and result with the same alias annotation `(a)`.
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); // Same parameters as call(), with a leading c10::DispatchKeySet.
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e2987cc5ce81e8d6764dfb6fa3951b257eaeed9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd { // Declarations only; the NB at the top of this header names RegisterDispatchKey.cpp as the implementing file.
+
+TORCH_API at::Tensor arcsinh(const at::Tensor & self); // Returns at::Tensor by value.
+TORCH_API at::Tensor & arcsinh_out(at::Tensor & out, const at::Tensor & self); // `out` is the first parameter.
+TORCH_API at::Tensor & arcsinh_outf(const at::Tensor & self, at::Tensor & out); // Same types as arcsinh_out, but `out` is the last parameter.
+TORCH_API at::Tensor & arcsinh_(at::Tensor & self); // Takes `self` by mutable reference and returns a reference; presumably the in-place variant - confirm in the implementation.
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/block_diag_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/block_diag_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..b1b76be9646bf755eb896eb79f8efde2ed3a665c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/block_diag_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // Declarations only; no definitions appear in this header.
+TORCH_API at::Tensor block_diag(at::TensorList tensors); // Returns at::Tensor by value.
+TORCH_API at::Tensor & block_diag_out(at::TensorList tensors, at::Tensor & out); // `out` is the trailing mutable-reference parameter; returns at::Tensor&.
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeexplicitautogradnonfunctional_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e504a5eaf81ddf258dd7682f86be0f21731d5c2e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional { // Declarations only; the NB at the top of this header names RegisterDispatchKey.cpp as the implementing file.
+
+TORCH_API at::Tensor cat(const at::ITensorListRef & tensors, int64_t dim=0); // `dim` defaults to 0 when omitted.
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeimplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3427483dc742ceb26cf35ae0c68792a51bcf6a7b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd { // Declarations only; the NB at the top of this header names RegisterDispatchKey.cpp as the implementing file.
+
+TORCH_API at::Tensor cat(at::TensorList tensors, at::Dimname dim); // Overload taking the dimension as an at::Dimname; no default for `dim`.
+TORCH_API at::Tensor & cat_out(at::Tensor & out, at::TensorList tensors, at::Dimname dim); // `out` is the first parameter.
+TORCH_API at::Tensor & cat_outf(at::TensorList tensors, at::Dimname dim, at::Tensor & out); // Same types as cat_out, but `out` is the last parameter.
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..946edb16883ffccb46a7ecf3811f29ffa77664de
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda { // Declarations only; the NB at the top of this header names RegisterDispatchKey.cpp as the implementing file.
+
+TORCH_API at::Tensor cudnn_grid_sampler(const at::Tensor & self, const at::Tensor & grid); // Takes both tensors by const reference and returns at::Tensor by value.
+
+} // namespace cuda
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag_embed_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag_embed_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..857a490cd2a051f30549884f1b728d0212aa18e3
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag_embed_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd { // Declarations only; the NB at the top of this header names RegisterDispatchKey.cpp as the implementing file.
+
+TORCH_API at::Tensor & diag_embed_out(at::Tensor & out, const at::Tensor & self, int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1); // `out` first; offset/dim1/dim2 default to 0/-2/-1.
+TORCH_API at::Tensor & diag_embed_outf(const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out); // `out` last; no defaults, so every argument must be passed.
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine.h
new file mode 100644
index 0000000000000000000000000000000000000000..3cb5f4ec837a8b97ce402b27cb4cb40b7ccc0903
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/fake_quantize_per_channel_affine_ops.h>
+
+namespace at {
+
+
+// aten::fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor
+inline at::Tensor fake_quantize_per_channel_affine(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max) { // Thin inline wrapper; contains no logic of its own.
+    return at::_ops::fake_quantize_per_channel_affine::call(self, scale, zero_point, axis, quant_min, quant_max); // Forwards all six arguments unchanged and in the same order.
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fill_meta_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fill_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..97fe69bc140d0408365f51e546b2ebbc0350a031
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fill_meta_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta { // Declarations only; the NB at the top of this header names RegisterDispatchKey.cpp as the implementing file.
+
+TORCH_API at::Tensor & fill_(at::Tensor & self, const at::Scalar & value); // Overload taking the fill value as an at::Scalar.
+TORCH_API at::Tensor & fill_(at::Tensor & self, const at::Tensor & value); // Overload taking the fill value as an at::Tensor.
+
+} // namespace meta
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frexp.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frexp.h
new file mode 100644
index 0000000000000000000000000000000000000000..b1643f7b5fe635dc54e209a33b204c5be7dbffc1
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frexp.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/frexp_ops.h>
+
+namespace at {
+
+
+// aten::frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)
+inline ::std::tuple<at::Tensor,at::Tensor> frexp(const at::Tensor & self) { // Thin inline wrapper; returns a two-tensor tuple (named mantissa, exponent in the schema comment above).
+    return at::_ops::frexp_Tensor::call(self); // Forwards `self` unchanged to the frexp_Tensor operator struct.
+}
+
+// aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent)
+inline ::std::tuple<at::Tensor &,at::Tensor &> frexp_out(at::Tensor & mantissa, at::Tensor & exponent, const at::Tensor & self) { // Out-arguments-first spelling; returns a tuple of references.
+    return at::_ops::frexp_Tensor_out::call(self, mantissa, exponent); // Reorders the arguments: call() takes `self` first and the two outputs last.
+}
+// aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent)
+inline ::std::tuple<at::Tensor &,at::Tensor &> frexp_outf(const at::Tensor & self, at::Tensor & mantissa, at::Tensor & exponent) { // Out-arguments-last spelling; parameter order already matches call().
+    return at::_ops::frexp_Tensor_out::call(self, mantissa, exponent); // Forwards all arguments unchanged; same target as frexp_out.
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu_backward.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f860cdd20e4aa57aa8794ba4e9fb618f7a3dc7c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu_backward.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/gelu_backward_ops.h>
+
+namespace at {
+
+
+// aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & gelu_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none") { // Output-first spelling; `approximate` defaults to "none".
+    return at::_ops::gelu_backward_grad_input::call(grad_output, self, approximate, grad_input); // Reorders the arguments: call() takes `grad_input` last.
+}
+// aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & gelu_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate, at::Tensor & grad_input) { // Output-last spelling; `approximate` has no default here.
+    return at::_ops::gelu_backward_grad_input::call(grad_output, self, approximate, grad_input); // Forwards all arguments unchanged; same target as gelu_backward_out.
+}
+
+// aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor
+inline at::Tensor gelu_backward(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none") { // Returns at::Tensor by value; `approximate` defaults to "none".
+    return at::_ops::gelu_backward::call(grad_output, self, approximate); // Forwards all three arguments unchanged.
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c4c85ba0f0773c9cada314ffa34fd8d2822267b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_i0 : public TensorIteratorBase { // Publicly derives from TensorIteratorBase.
+    // The only member declared here is meta(); it is a declaration without a body,
+    // so its definition is not visible in this header.
+    void meta(const at::Tensor & self);
+};
+
+} // namespace meta
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..f592d7baa8b30ee7a0acb6ed043d9c9f41532932
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API is_pinned { // Bundles the schema type, name strings and static call/redispatch declarations for aten::is_pinned (empty overload name).
+  using schema = bool (const at::Tensor &, ::std::optional<at::Device>); // Function type with the same parameter/return types as call().
+  using ptr_schema = schema*;
+  // name, overload_name and schema_str are declared via a macro; see Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::is_pinned")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "is_pinned(Tensor self, Device? device=None) -> bool")
+  static bool call(const at::Tensor & self, ::std::optional<at::Device> device); // Declaration only; schema_str lists device=None, but this C++ signature has no default argument.
+  static bool redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional<at::Device> device); // Same parameters as call(), with a leading c10::DispatchKeySet.
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lstm.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lstm.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d29fb9ea6594a8a01660326d3c482e0e687c779
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lstm.h
@@ -0,0 +1,35 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/lstm_ops.h>
+
+namespace at {
+
+
+// aten::lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> lstm(const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) { // "input" overload: has `batch_first`, no `batch_sizes`.
+    return at::_ops::lstm_input::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first); // Forwards all nine arguments unchanged and in the same order.
+}
+
+// aten::lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> lstm(const at::Tensor & data, const at::Tensor & batch_sizes, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) { // "data" overload: has `batch_sizes`, no `batch_first`.
+    return at::_ops::lstm_data::call(data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional); // Forwards all nine arguments unchanged and in the same order.
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter.h
new file mode 100644
index 0000000000000000000000000000000000000000..e34eb9d752867f101a92e0ce22cb6ee1026ecb75
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/masked_scatter_ops.h>
+
+namespace at {
+
+
+// aten::masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor
+inline at::Tensor masked_scatter(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source) { // Thin inline wrapper; returns at::Tensor by value.
+    return at::_ops::masked_scatter::call(self, mask, source); // Forwards all three arguments unchanged.
+}
+
+// aten::masked_scatter.out(Tensor self, Tensor mask, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & masked_scatter_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source) { // Output-first spelling; returns at::Tensor&.
+    return at::_ops::masked_scatter_out::call(self, mask, source, out); // Reorders the arguments: call() takes `out` last.
+}
+// aten::masked_scatter.out(Tensor self, Tensor mask, Tensor source, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & masked_scatter_outf(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source, at::Tensor & out) { // Output-last spelling; parameter order already matches call().
+    return at::_ops::masked_scatter_out::call(self, mask, source, out); // Forwards all arguments unchanged; same target as masked_scatter_out.
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..22df2ecabc6cf5319a069bcad8250ad1624ad254
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API multinomial_out { // Bundles the schema type, name strings and static call/redispatch declarations for the "out" overload of aten::multinomial.
+  using schema = at::Tensor & (const at::Tensor &, int64_t, bool, ::std::optional<at::Generator>, at::Tensor &); // Function type with the same parameter/return types as call().
+  using ptr_schema = schema*;
+  // name, overload_name and schema_str are declared via a macro; see Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::multinomial")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, int64_t num_samples, bool replacement, ::std::optional<at::Generator> generator, at::Tensor & out); // Declaration only; schema_str lists defaults for replacement/generator, but this C++ signature has none.
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_samples, bool replacement, ::std::optional<at::Generator> generator, at::Tensor & out); // Same parameters as call(), with a leading c10::DispatchKeySet.
+};
+
+struct TORCH_API multinomial { // Bundles the schema type, name strings and static call/redispatch declarations for aten::multinomial (empty overload name).
+  using schema = at::Tensor (const at::Tensor &, int64_t, bool, ::std::optional<at::Generator>); // Function type with the same parameter/return types as call().
+  using ptr_schema = schema*;
+  // name, overload_name and schema_str are declared via a macro; see Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::multinomial")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, int64_t num_samples, bool replacement, ::std::optional<at::Generator> generator); // Declaration only; schema_str lists defaults for replacement/generator, but this C++ signature has none.
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_samples, bool replacement, ::std::optional<at::Generator> generator); // Same parameters as call(), with a leading c10::DispatchKeySet.
+};
+
+}} // namespace at::_ops
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..64b0867f188b54f1bbcc7724d12cac678dbde6fb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native { // Declaration only; no definition appears in this header.
+TORCH_API at::Tensor nested_to_padded_tensor(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=::std::nullopt); // `output_size` defaults to ::std::nullopt when omitted.
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_full.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_full.h
new file mode 100644
index 0000000000000000000000000000000000000000..b87e46547b31fdea76c67de4a72e898c65f9ff3c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_full.h
@@ -0,0 +1,97 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/new_full_ops.h>
+
+namespace at {
+
+
+namespace symint {  // new_full<int64_t>, TensorOptions form: options is unpacked into dtype/layout/device/pin_memory for call()
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor new_full(const at::Tensor& self, at::IntArrayRef size, const at::Scalar& fill_value, at::TensorOptions options = {}) {
+    return at::_ops::new_full::call(self, c10::fromIntArrayRefSlow(size), fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+  }
+}  // namespace symint
+
+namespace symint {  // new_full<int64_t>, unpacked form: dtype/layout/device/pin_memory are forwarded to call() as given
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor new_full(const at::Tensor& self, at::IntArrayRef size, const at::Scalar& fill_value, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory) {
+    return at::_ops::new_full::call(self, c10::fromIntArrayRefSlow(size), fill_value, dtype, layout, device, pin_memory);
+  }
+}  // namespace symint
+
+namespace symint {  // new_full<c10::SymInt>, TensorOptions form: size is passed through unchanged, options is unpacked for call()
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor new_full(const at::Tensor& self, c10::SymIntArrayRef size, const at::Scalar& fill_value, at::TensorOptions options = {}) {
+    return at::_ops::new_full::call(self, size, fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+  }
+}  // namespace symint
+
+namespace symint {  // new_full<c10::SymInt>, unpacked form: every argument is forwarded to call() as given
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor new_full(const at::Tensor& self, c10::SymIntArrayRef size, const at::Scalar& fill_value, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory) {
+    return at::_ops::new_full::call(self, size, fill_value, dtype, layout, device, pin_memory);
+  }
+}  // namespace symint
+
+// out-first wrapper for aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& new_full_out(at::Tensor& out, const at::Tensor& self, at::IntArrayRef size, const at::Scalar& fill_value) {
+  return at::_ops::new_full_out::call(self, c10::fromIntArrayRefSlow(size), fill_value, out);
+}
+namespace symint {  // new_full_out<int64_t>: out-first argument order, sizes go through c10::fromIntArrayRefSlow
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor& new_full_out(at::Tensor& out, const at::Tensor& self, at::IntArrayRef size, const at::Scalar& fill_value) {
+    return at::_ops::new_full_out::call(self, c10::fromIntArrayRefSlow(size), fill_value, out);
+  }
+}  // namespace symint
+
+// out-last wrapper for aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& new_full_outf(const at::Tensor& self, at::IntArrayRef size, const at::Scalar& fill_value, at::Tensor& out) {
+  return at::_ops::new_full_out::call(self, c10::fromIntArrayRefSlow(size), fill_value, out);
+}
+namespace symint {  // new_full_outf<int64_t>: out-last argument order, sizes go through c10::fromIntArrayRefSlow
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor& new_full_outf(const at::Tensor& self, at::IntArrayRef size, const at::Scalar& fill_value, at::Tensor& out) {
+    return at::_ops::new_full_out::call(self, c10::fromIntArrayRefSlow(size), fill_value, out);
+  }
+}  // namespace symint
+
+// out-first, SymInt-size wrapper for aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& new_full_symint_out(at::Tensor& out, const at::Tensor& self, c10::SymIntArrayRef size, const at::Scalar& fill_value) {
+  return at::_ops::new_full_out::call(self, size, fill_value, out);
+}
+namespace symint {  // new_full_out<c10::SymInt>: out-first argument order, size forwarded unchanged
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor& new_full_out(at::Tensor& out, const at::Tensor& self, c10::SymIntArrayRef size, const at::Scalar& fill_value) {
+    return at::_ops::new_full_out::call(self, size, fill_value, out);
+  }
+}  // namespace symint
+
+// out-last, SymInt-size wrapper for aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& new_full_symint_outf(const at::Tensor& self, c10::SymIntArrayRef size, const at::Scalar& fill_value, at::Tensor& out) {
+  return at::_ops::new_full_out::call(self, size, fill_value, out);
+}
+namespace symint {  // new_full_outf<c10::SymInt>: out-last argument order, size forwarded unchanged
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor& new_full_outf(const at::Tensor& self, c10::SymIntArrayRef size, const at::Scalar& fill_value, at::Tensor& out) {
+    return at::_ops::new_full_out::call(self, size, fill_value, out);
+  }
+}  // namespace symint
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cpu_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..85925396e31178f3d1ab049698d365e6e2d29333
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cpu_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// at::cpu declarations for nll_loss_forward. Each returns a 2-tuple; *_out takes (output, total_weight) first, *_outf takes them last, *_symint* takes ignore_index as c10::SymInt.
+namespace cpu {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> nll_loss_forward(const at::Tensor & self, const at::Tensor & target, const ::std::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> nll_loss_forward_symint(const at::Tensor & self, const at::Tensor & target, const ::std::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_out(at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const ::std::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_outf(const at::Tensor & self, const at::Tensor & target, const ::std::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, at::Tensor & output, at::Tensor & total_weight);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_symint_out(at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const ::std::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_symint_outf(const at::Tensor & self, const at::Tensor & target, const ::std::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & output, at::Tensor & total_weight);
+
+} // namespace cpu
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ones_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ones_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d83a3147201af3e69b736a4227ac0be2d2870ae
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ones_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,34 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// at::compositeexplicitautograd declarations for ones: overloads with and without `names`, taking either at::TensorOptions or unpacked optional dtype/layout/device/pin_memory, plus out/outf and SymInt-size variants.
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor ones(at::IntArrayRef size, ::std::optional<at::DimnameList> names, at::TensorOptions options={});
+TORCH_API at::Tensor ones(at::IntArrayRef size, ::std::optional<at::DimnameList> names, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+TORCH_API at::Tensor & ones_out(at::Tensor & out, at::IntArrayRef size, ::std::optional<at::DimnameList> names);
+TORCH_API at::Tensor & ones_outf(at::IntArrayRef size, ::std::optional<at::DimnameList> names, at::Tensor & out);
+TORCH_API at::Tensor ones(at::IntArrayRef size, at::TensorOptions options={});
+TORCH_API at::Tensor ones(at::IntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+TORCH_API at::Tensor ones_symint(c10::SymIntArrayRef size, at::TensorOptions options={});
+TORCH_API at::Tensor ones_symint(c10::SymIntArrayRef size, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+TORCH_API at::Tensor & ones_out(at::Tensor & out, at::IntArrayRef size);
+TORCH_API at::Tensor & ones_outf(at::IntArrayRef size, at::Tensor & out);
+TORCH_API at::Tensor & ones_symint_out(at::Tensor & out, c10::SymIntArrayRef size);
+TORCH_API at::Tensor & ones_symint_outf(c10::SymIntArrayRef size, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c6bcbfc97fb839cb850562c37df29668341c78c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h
@@ -0,0 +1,27 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// at::cpu declarations for remainder: Tensor-Tensor form, out (out first) and outf (out last), remainder_ (mutable self, returns a reference), and a Scalar-Tensor overload.
+namespace cpu {
+
+TORCH_API at::Tensor remainder(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & remainder_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & remainder_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & remainder_(at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor remainder(const at::Scalar & self, const at::Tensor & other);
+
+} // namespace cpu
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d.h
new file mode 100644
index 0000000000000000000000000000000000000000..de27b4f4722e228130ddcdf056e430e360577e77
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d.h
@@ -0,0 +1,91 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/replication_pad1d_ops.h>
+
+namespace at {
+
+
+// out-first wrapper for aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& replication_pad1d_out(at::Tensor& out, const at::Tensor& self, at::IntArrayRef padding) {
+  return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out);
+}
+namespace symint {  // replication_pad1d_out<int64_t>: out-first argument order, padding goes through c10::fromIntArrayRefSlow
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor& replication_pad1d_out(at::Tensor& out, const at::Tensor& self, at::IntArrayRef padding) {
+    return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out);
+  }
+}  // namespace symint
+
+// out-last wrapper for aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& replication_pad1d_outf(const at::Tensor& self, at::IntArrayRef padding, at::Tensor& out) {
+  return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out);
+}
+namespace symint {  // replication_pad1d_outf<int64_t>: out-last argument order, padding goes through c10::fromIntArrayRefSlow
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor& replication_pad1d_outf(const at::Tensor& self, at::IntArrayRef padding, at::Tensor& out) {
+    return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out);
+  }
+}  // namespace symint
+
+// out-first, SymInt-padding wrapper for aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& replication_pad1d_symint_out(at::Tensor& out, const at::Tensor& self, c10::SymIntArrayRef padding) {
+  return at::_ops::replication_pad1d_out::call(self, padding, out);
+}
+namespace symint {  // replication_pad1d_out<c10::SymInt>: out-first argument order, padding forwarded unchanged
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor& replication_pad1d_out(at::Tensor& out, const at::Tensor& self, c10::SymIntArrayRef padding) {
+    return at::_ops::replication_pad1d_out::call(self, padding, out);
+  }
+}  // namespace symint
+
+// out-last, SymInt-padding wrapper for aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& replication_pad1d_symint_outf(const at::Tensor& self, c10::SymIntArrayRef padding, at::Tensor& out) {
+  return at::_ops::replication_pad1d_out::call(self, padding, out);
+}
+namespace symint {  // replication_pad1d_outf<c10::SymInt>: out-last argument order, padding forwarded unchanged
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor& replication_pad1d_outf(const at::Tensor& self, c10::SymIntArrayRef padding, at::Tensor& out) {
+    return at::_ops::replication_pad1d_out::call(self, padding, out);
+  }
+}  // namespace symint
+
+// IntArrayRef-padding wrapper for aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor
+inline at::Tensor replication_pad1d(const at::Tensor& self, at::IntArrayRef padding) {
+  return at::_ops::replication_pad1d::call(self, c10::fromIntArrayRefSlow(padding));
+}
+namespace symint {  // replication_pad1d<int64_t>: padding goes through c10::fromIntArrayRefSlow before call()
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  at::Tensor replication_pad1d(const at::Tensor& self, at::IntArrayRef padding) {
+    return at::_ops::replication_pad1d::call(self, c10::fromIntArrayRefSlow(padding));
+  }
+}  // namespace symint
+
+// SymInt-padding wrapper for aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor
+inline at::Tensor replication_pad1d_symint(const at::Tensor& self, c10::SymIntArrayRef padding) {
+  return at::_ops::replication_pad1d::call(self, padding);
+}
+namespace symint {  // replication_pad1d<c10::SymInt>: padding forwarded unchanged to call()
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  at::Tensor replication_pad1d(const at::Tensor& self, c10::SymIntArrayRef padding) {
+    return at::_ops::replication_pad1d::call(self, padding);
+  }
+}  // namespace symint
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rshift.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rshift.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e457a2885253bb6041bdd49ae34a4b1731452a2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rshift.h
@@ -0,0 +1,53 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/rshift_ops.h>
+
+namespace at {
+
+
+// Scalar right-hand operand: aten::__rshift__.Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor __rshift__(const at::Tensor& self, const at::Scalar& other) {
+  return at::_ops::__rshift___Scalar::call(self, other);
+}
+
+// Tensor right-hand operand: aten::__rshift__.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor __rshift__(const at::Tensor& self, const at::Tensor& other) {
+  return at::_ops::__rshift___Tensor::call(self, other);
+}
+
+// out-first wrapper for aten::__rshift__.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& __rshift___out(at::Tensor& out, const at::Tensor& self, const at::Scalar& other) {
+  return at::_ops::__rshift___Scalar_out::call(self, other, out);
+}
+// out-last wrapper for aten::__rshift__.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& __rshift___outf(const at::Tensor& self, const at::Scalar& other, at::Tensor& out) {
+  return at::_ops::__rshift___Scalar_out::call(self, other, out);
+}
+
+// out-first wrapper for aten::__rshift__.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& __rshift___out(at::Tensor& out, const at::Tensor& self, const at::Tensor& other) {
+  return at::_ops::__rshift___Tensor_out::call(self, other, out);
+}
+// out-last wrapper for aten::__rshift__.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& __rshift___outf(const at::Tensor& self, const at::Tensor& other, at::Tensor& out) {
+  return at::_ops::__rshift___Tensor_out::call(self, other, out);
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..81a90e7aa3c11bbe1960038296a60d5468846192
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// at::native declarations for select_copy: an out variant and a functional variant taking `index` as c10::SymInt, plus select_copy_sparse_csr taking an int64_t index.
+namespace at {
+namespace native {
+TORCH_API at::Tensor & select_copy_int_out_symint(const at::Tensor & self, int64_t dim, c10::SymInt index, at::Tensor & out);
+TORCH_API at::Tensor select_copy_sparse_csr(const at::Tensor & self, int64_t dim, int64_t index);
+TORCH_API at::Tensor select_copy_symint(const at::Tensor & self, int64_t dim, c10::SymInt index);
+} // namespace native
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..8797587a5c47da989771e0fd3dd15b2cc2337ecb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// at::cuda declarations for special_erfcx: the value-returning form plus out (out first) and outf (out last), both returning at::Tensor &.
+namespace cuda {
+
+TORCH_API at::Tensor special_erfcx(const at::Tensor & self);
+TORCH_API at::Tensor & special_erfcx_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_erfcx_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices.h
new file mode 100644
index 0000000000000000000000000000000000000000..679abf2f9cd1a31e45bf27c3cd5382ebe47408d5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices.h
@@ -0,0 +1,43 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+
+
+
+#include <ATen/ops/tril_indices_ops.h>
+
+namespace at {
+
+
+// TensorOptions form (options unpacked for call()) of aten::tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset = 0, at::TensorOptions options = at::kLong) {
+  return at::_ops::tril_indices::call(row, col, offset, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+// Unpacked-optionals form (arguments forwarded as given) of aten::tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory) {
+  return at::_ops::tril_indices::call(row, col, offset, dtype, layout, device, pin_memory);
+}
+
+// out-first wrapper for aten::tril_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& tril_indices_out(at::Tensor& out, int64_t row, int64_t col, int64_t offset = 0) {
+  return at::_ops::tril_indices_out::call(row, col, offset, out);
+}
+// out-last wrapper (offset has no default here) for aten::tril_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor& tril_indices_outf(int64_t row, int64_t col, int64_t offset, at::Tensor& out) {
+  return at::_ops::tril_indices_out::call(row, col, offset, out);
+}
+
+}
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..22378108e53f1463b36bbc19037c9a63e23a396e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// at::cuda declarations for tril_indices: a TensorOptions overload (offset defaults to 0, options to at::kLong) and an overload with unpacked optional dtype/layout/device/pin_memory.
+namespace cuda {
+
+TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong);
+TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset, ::std::optional<at::ScalarType> dtype, ::std::optional<at::Layout> layout, ::std::optional<at::Device> device, ::std::optional<bool> pin_memory);
+
+} // namespace cuda
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unique_dim_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unique_dim_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfd1ca3d1de705406ebb785410716dc9ea861d9d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unique_dim_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// at::compositeexplicitautograd declarations for unique_dim's out variants. Both return a tuple of three at::Tensor &; *_out takes out0..out2 first with defaulted flags, *_outf takes them last with no defaults.
+namespace compositeexplicitautograd {
+
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> unique_dim_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, int64_t dim, bool sorted=true, bool return_inverse=false, bool return_counts=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> unique_dim_outf(const at::Tensor & self, int64_t dim, bool sorted, bool return_inverse, bool return_counts, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..c413a2b881a99980203f6827ddc498d2fb48ece9
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/upsample_linear1d_meta.h>
+// at::native declarations for upsample_linear1d: a free function taking optional output_size/scale_factors, and two structs deriving from at::meta::structured_upsample_linear1d.
+namespace at {
+namespace native {
+TORCH_API at::Tensor upsample_linear1d(const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, ::std::optional<at::ArrayRef<double>> scale_factors);
+struct TORCH_API structured_upsample_linear1d_out_cpu : public at::meta::structured_upsample_linear1d { // NOTE(review): presumably the CPU kernel, going by the name; impl() is only declared here
+void impl(const at::Tensor & self, at::ArrayRef<int64_t> output_size, bool align_corners, ::std::optional<double> scales, const at::Tensor & out);
+};
+struct TORCH_API structured_upsample_linear1d_out_cuda : public at::meta::structured_upsample_linear1d { // NOTE(review): presumably the CUDA kernel, going by the name; impl() is only declared here
+void impl(const at::Tensor & self, at::ArrayRef<int64_t> output_size, bool align_corners, ::std::optional<double> scales, const at::Tensor & out);
+};
+} // namespace native
+} // namespace at